conv_lstm

    # This script demonstrates the use of a convolutional LSTM network.
    # The network is used to predict the next frame of an artificially
    # generated movie containing moving squares.
    
    library(keras)
    library(abind)
    library(raster)
    
    # Function Definition -----------------------------------------------------
    
    generate_movies <- function(n_samples = 1200, n_frames = 15){
      
      rows <- 80
      cols <- 80
      
      noisy_movies <- array(0, dim = c(n_samples, n_frames, rows, cols))
      shifted_movies <- array(0, dim = c(n_samples, n_frames, rows, cols))
      
      for(s in 1:n_samples){
        # Each movie contains a random number of moving squares (3 to 7)
        n <- sample(3:7, 1)
        for(i in 1:n){
          # Initial position
          xstart <- sample(20:60, 1)
          ystart <- sample(20:60, 1)
          
          # Direction of motion
          directionx <- sample(-1:1, 1)
          directiony <- sample(-1:1, 1)
          
          # Size of the square
          w <- sample(2:3, 1)
          
          # Positions over time (one extra step for the shifted ground truth)
          x_shift <- xstart + directionx*(0:n_frames)
          y_shift <- ystart + directiony*(0:n_frames)
          
          for(t in 1:n_frames){
            square_x <- (x_shift[t] - w):(x_shift[t] + w)
            square_y <- (y_shift[t] - w):(y_shift[t] + w)
            
            noisy_movies[s, t, square_x, square_y] <- 
              noisy_movies[s, t, square_x, square_y] + 1
            
            # Make it more robust by adding noise. The idea is that if,
            # during inference, the value of a pixel is not exactly one,
            # the network should still consider it part of a square.
            if(runif(1) > 0.5){
              noise_f <- sample(c(-1, 1), 1)
              
              square_x_n <- (x_shift[t] - w - 1):(x_shift[t] + w + 1)
              square_y_n <- (y_shift[t] - w - 1):(y_shift[t] + w + 1)
              
              noisy_movies[s, t, square_x_n, square_y_n] <- 
                noisy_movies[s, t, square_x_n, square_y_n] + noise_f*0.1
              
            }
            
            # Shift the ground truth by 1
            square_x_s <- (x_shift[t+1] - w):(x_shift[t+1] + w)
            square_y_s <- (y_shift[t+1] - w):(y_shift[t+1] + w)
            
            shifted_movies[s, t, square_x_s, square_y_s] <- 
              shifted_movies[s, t, square_x_s, square_y_s] + 1
          }
        }  
      }
      
      # Cut to a 40x40 window
      noisy_movies <- noisy_movies[,,21:60, 21:60]
      shifted_movies <- shifted_movies[,,21:60, 21:60]
      
      # Clip pixel values to a maximum of 1
      noisy_movies[noisy_movies > 1] <- 1
      shifted_movies[shifted_movies > 1] <- 1
    
      # Add channel dimension
      noisy_movies <- array_reshape(noisy_movies, c(dim(noisy_movies), 1))
      shifted_movies <- array_reshape(shifted_movies, c(dim(shifted_movies), 1))
      
      list(
        noisy_movies = noisy_movies,
        shifted_movies = shifted_movies
      )
    }
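    
    # Optional spot check (not part of the training flow): plot one noisy
    # frame next to its shifted ground-truth frame; the square should have
    # moved by one step.
    demo <- generate_movies(n_samples = 1, n_frames = 15)
    par(mfrow = c(1, 2), bg = "white")
    plot(raster(demo$noisy_movies[1, 1, , , 1]), main = "Noisy frame 1")
    plot(raster(demo$shifted_movies[1, 1, , , 1]), main = "Shifted frame 1")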
    
    
    # Data Preparation --------------------------------------------------------
    
    # Artificial data generation:
      # Generate movies with 3 to 7 moving squares inside.
      # The squares are 5x5 or 7x7 pixels and move linearly over time.
      # For convenience, we first create movies with a bigger width and
      # height (80x80), and at the end we select a 40x40 window.
    movies <- generate_movies(n_samples = 1000, n_frames = 15)
    more_movies <- generate_movies(n_samples = 200, n_frames = 15)
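    
    # Both arrays have shape (samples, frames, rows, cols, channels)
    dim(movies$noisy_movies)       # 1000 15 40 40 1
    dim(more_movies$noisy_movies)  # 200 15 40 40 1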
    
    
    # Model definition --------------------------------------------------------
    
    # Initialize model
    model <- keras_model_sequential()
    
    model %>%
    
      # Begin with a 2D convolutional LSTM layer. NULL in the input shape
      # allows sequences with a variable number of frames.
      layer_conv_lstm_2d(
        input_shape = list(NULL,40,40,1), 
        filters = 40, kernel_size = c(3,3),
        padding = "same", 
        return_sequences = TRUE
      ) %>%
      # Normalize the activations of the previous layer
      layer_batch_normalization() %>%
      
      # Add three hidden 2D convolutional LSTM layers, with
      # batch normalization layers in between
      layer_conv_lstm_2d(
        filters = 40, kernel_size = c(3,3),
        padding = "same", return_sequences = TRUE
      ) %>%
      layer_batch_normalization() %>%
      layer_conv_lstm_2d(
        filters = 40, kernel_size = c(3,3),
        padding = "same", return_sequences = TRUE
      ) %>%
      layer_batch_normalization() %>% 
      layer_conv_lstm_2d(
        filters = 40, kernel_size = c(3,3),
        padding = "same", return_sequences = TRUE
      ) %>%
      layer_batch_normalization() %>%
      
      # Add the final 3D convolutional output layer, which maps the
      # activations to a single sigmoid channel (a per-pixel probability)
      layer_conv_3d(
        filters = 1, kernel_size = c(3,3,3),
        activation = "sigmoid",
        padding = "same", data_format = "channels_last"
      )
    
    # Prepare model for training
    model %>% compile(
      loss = "binary_crossentropy", 
      optimizer = "adadelta"
    )
    
    model
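    
    # Shape check (a sketch): with return_sequences = TRUE throughout and
    # "same" padding, the model maps (batch, frames, 40, 40, 1) inputs to
    # outputs of the same shape
    pred <- model %>% predict(movies$noisy_movies[1:2, , , , , drop = FALSE])
    dim(pred)  # 2 15 40 40 1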
    
    
    # Training ----------------------------------------------------------------
    
    model %>% fit(
      movies$noisy_movies,
      movies$shifted_movies,
      batch_size = 10,
      epochs = 30, 
      validation_split = 0.05
    )
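    
    # Optionally, estimate the out-of-sample loss (the binary cross-entropy
    # we compiled with) on the held-out movies
    model %>% evaluate(
      more_movies$noisy_movies,
      more_movies$shifted_movies,
      batch_size = 10
    )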
    
    
    # Visualization ----------------------------------------------------------------
    
    # Test the network on one movie: feed it the first 7 positions and
    # then predict the new positions
    
    # Example to visualize
    which <- 100
    
    # Seed the track with the first 7 noisy frames of the chosen movie
    track <- more_movies$noisy_movies[which, 1:7, , , 1]
    track <- array(track, c(1, 7, 40, 40, 1))
    
    for (k in 1:15){
      if (k < 8){
        # For the seed frames there is no prediction yet, so show the
        # ground truth in both panels to keep the layout constant
        png(sprintf("%02d_animate.png", k))
        par(mfrow = c(1, 2), bg = "white")
        more_movies$noisy_movies[which, k, , , 1] %>% raster() %>% plot() %>% title(main = paste0("Ground_", k))
        more_movies$noisy_movies[which, k, , , 1] %>% raster() %>% plot() %>% title(main = paste0("Ground_", k))
        dev.off()
      } else {
        
        # Compare the predictions to the ground truth
        png(sprintf("%02d_animate.png", k))
        par(mfrow = c(1, 2), bg = "white")
        more_movies$noisy_movies[which, k, , , 1] %>% raster() %>% plot() %>% title(main = paste0("Ground_", k))
        
        # Predict from all frames seen so far
        new_pos <- model %>% predict(track)
        
        # Slice the last timestep: the model's prediction of frame k
        new_pos_loc <- new_pos[1, k - 1, , , 1]
        new_pos_loc %>% raster() %>% plot() %>% title(main = paste0("Pred_", k))
        
        # Reshape it to a single-frame 5D array
        new_pos <- array(new_pos_loc, c(1, 1, 40, 40, 1))
        
        # Append the prediction to the earlier frames
        track <- abind(track, new_pos, along = 2)
        dev.off()
      }
    }
    
    # A GIF can also be created by running the following (requires
    # ImageMagick; the zero-padded file names keep the frames in order)
    system("convert -delay 40 *.png animation.gif")