Convolutional LSTM network

This script demonstrates the use of a convolutional LSTM network to predict the next frame of an artificially generated movie containing moving squares.

library(keras)
library(abind)
library(raster)

Function Definition

generate_movies <- function(n_samples = 1200, n_frames = 15) {
  rows <- 80
  cols <- 80
  
  noisy_movies <- array(0, dim = c(n_samples, n_frames, rows, cols))
  shifted_movies <-
    array(0, dim = c(n_samples, n_frames, rows, cols))
  
  for (s in 1:n_samples) {
    # Random number of moving squares in this movie
    n <- sample(3:8, 1)
    
    for (i in 1:n) {
      # Initial position
      xstart <- sample(20:60, 1)
      ystart <- sample(20:60, 1)
      
      # Direction of motion
      directionx <- sample(-1:1, 1)
      directiony <- sample(-1:1, 1)
      
      # Half-width of the square (it spans 2w + 1 pixels per side)
      w <- sample(2:3, 1)
      
      # Positions over time (n_frames + 1 points, since the target
      # frame is shifted one step ahead of the input frame)
      x_shift <- xstart + directionx * (0:n_frames)
      y_shift <- ystart + directiony * (0:n_frames)
      
      for (t in 1:n_frames) {
        square_x <- (x_shift[t] - w):(x_shift[t] + w)
        square_y <- (y_shift[t] - w):(y_shift[t] + w)
        
        # Draw the square at its position in frame t
        noisy_movies[s, t, square_x, square_y] <-
          noisy_movies[s, t, square_x, square_y] + 1
        
        # Make it more robust by adding noise: if, during inference,
        # a pixel's value is not exactly one, the network should still
        # treat it as part of a square.
        if (runif(1) > 0.5) {
          noise_f <- sample(c(-1, 1), 1)
          
          square_x_n <- (x_shift[t] - w - 1):(x_shift[t] + w + 1)
          square_y_n <- (y_shift[t] - w - 1):(y_shift[t] + w + 1)
          
          noisy_movies[s, t, square_x_n, square_y_n] <-
            noisy_movies[s, t, square_x_n, square_y_n] + noise_f * 0.1
          
        }
        
        # Shift the ground truth by 1
        square_x_s <- (x_shift[t + 1] - w):(x_shift[t + 1] + w)
        square_y_s <- (y_shift[t + 1] - w):(y_shift[t + 1] + w)
        
        shifted_movies[s, t, square_x_s, square_y_s] <-
          shifted_movies[s, t, square_x_s, square_y_s] + 1
      }
    }
  }
  
  # Cut to a 40x40 window
  noisy_movies <- noisy_movies[, , 21:60, 21:60]
  shifted_movies <- shifted_movies[, , 21:60, 21:60]
  
  # Clip overlapping squares and noise to a maximum value of 1
  noisy_movies[noisy_movies > 1] <- 1
  shifted_movies[shifted_movies > 1] <- 1
  
  # Add channel dimension
  noisy_movies <-
    array_reshape(noisy_movies, c(dim(noisy_movies), 1))
  shifted_movies <-
    array_reshape(shifted_movies, c(dim(shifted_movies), 1))
  
  list(noisy_movies = noisy_movies,
       shifted_movies = shifted_movies)
}
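As a quick sanity check, we can generate a handful of movies and confirm the array shapes (a minimal sketch; the tiny sample count and the name `tiny` are just for illustration):

# Shapes should be (samples, frames, 40, 40, 1)
tiny <- generate_movies(n_samples = 4, n_frames = 15)
dim(tiny$noisy_movies)    # 4 15 40 40 1
dim(tiny$shifted_movies)  # 4 15 40 40 1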

Data Preparation

Artificial data generation:

  • Generate movies with 3 to 8 moving squares inside.
  • The squares are 5x5 or 7x7 pixels and move linearly over time.
  • For convenience, we first create movies with a bigger width and height (80x80) and at the end select a 40x40 window.
# Training data plus a held-out set used later for visualization
movies <- generate_movies(n_samples = 1000, n_frames = 15)
more_movies <- generate_movies(n_samples = 200, n_frames = 15)
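To verify the generated data, it helps to plot an input frame next to its target; a minimal sketch using the raster package loaded above (panel titles are our own):

# Show the first noisy frame of the first movie and the corresponding
# target frame, which is the same scene shifted one step ahead
par(mfrow = c(1, 2), bg = 'white')
movies$noisy_movies[1, 1, , , 1] %>% raster() %>% plot()
title(main = 'Noisy input')
movies$shifted_movies[1, 1, , , 1] %>% raster() %>% plot()
title(main = 'Shifted target')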

Model Definition

# Initialize the model
model <- keras_model_sequential()

model %>%

  # Begin with a 2D convolutional LSTM layer. The leading NULL in
  # input_shape leaves the number of time steps unspecified, so
  # sequences of any length can be fed in.
  layer_conv_lstm_2d(
    input_shape = list(NULL, 40, 40, 1),
    filters = 40, kernel_size = c(3,3),
    padding = "same",
    return_sequences = TRUE
  ) %>%
  # Normalize the activations of the previous layer
  layer_batch_normalization() %>%
  
  # Add 3 hidden 2D convolutional LSTM layers, with
  # batch normalization layers in between
  layer_conv_lstm_2d(
    filters = 40, kernel_size = c(3,3),
    padding = "same", return_sequences = TRUE
  ) %>%
  layer_batch_normalization() %>%
  layer_conv_lstm_2d(
    filters = 40, kernel_size = c(3,3),
    padding = "same", return_sequences = TRUE
  ) %>%
  layer_batch_normalization() %>% 
  layer_conv_lstm_2d(
    filters = 40, kernel_size = c(3,3),
    padding = "same", return_sequences = TRUE
  ) %>%
  layer_batch_normalization() %>%
  
  # Add a final 3D convolutional output layer producing a single
  # sigmoid channel: per-pixel probabilities for each frame
  layer_conv_3d(
    filters = 1, kernel_size = c(3,3,3),
    activation = "sigmoid",
    padding = "same", data_format = "channels_last"
  )
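Since every layer uses padding = "same" and returns full sequences, the output has the same spatial and temporal shape as the input, with a single channel. A quick check (a sketch; the exact printed format depends on the installed keras version):

# Expect (batch, time, 40, 40, 1), with batch and time left as NULL
model$output_shape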

# Prepare model for training
model %>% compile(
  loss = "binary_crossentropy", 
  optimizer = "adadelta"
)
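Passing the optimizer as a string uses its default configuration. An equivalent compile with an explicit optimizer object (a sketch; optimizer_adadelta() exposes tuning arguments such as the learning rate):

model %>% compile(
  loss = "binary_crossentropy",
  optimizer = optimizer_adadelta()
)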

# Inspect the model summary
model

Training

model %>% fit(
  movies$noisy_movies,
  movies$shifted_movies,
  batch_size = 10,
  epochs = 30, 
  validation_split = 0.05
)
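After training, the held-out `more_movies` set gives a quick measure of generalization before we look at individual predictions; a minimal sketch using the loss configured above:

# Evaluate the binary cross-entropy loss on the held-out movies
model %>% evaluate(
  more_movies$noisy_movies,
  more_movies$shifted_movies,
  batch_size = 10
)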

Visualization

# Test the network on one movie: feed it the first 8 frames,
# then predict the following frames one at a time

# Index of the example movie to visualize
movie_index <- 100

# First 8 noisy frames, reshaped to (batch, time, rows, cols, channels)
track <- more_movies$noisy_movies[movie_index, 1:8, , , 1]
track <- array(track, c(1, 8, 40, 40, 1))

for (k in 1:15) {
  png(paste0(k, '_animate.png'))
  par(mfrow = c(1, 2), bg = 'white')
  
  # Left panel: the ground-truth frame
  more_movies$noisy_movies[movie_index, k, , , 1] %>% raster() %>% plot()
  title(main = paste0('Ground_', k))
  
  if (k < 8) {
    # No prediction yet; repeat the ground truth to keep the layout
    more_movies$noisy_movies[movie_index, k, , , 1] %>% raster() %>% plot()
    title(main = paste0('Ground_', k))
  } else {
    # Predict the whole sequence seen so far and keep the last frame
    new_pos <- model %>% predict(track)
    new_pos_loc <- new_pos[1, k, , , 1]
    
    # Right panel: the predicted frame, compared to the ground truth
    new_pos_loc %>% raster() %>% plot()
    title(main = paste0('Pred_', k))
    
    # Append the prediction to the input for the next step
    new_pos <- array(new_pos_loc, c(1, 1, 40, 40, 1))
    track <- abind(track, new_pos, along = 2)
  }
  
  dev.off()
}


# A gif can also be created from the saved frames (requires ImageMagick)
system("convert -delay 40 *.png animation.gif")
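The convert call assumes ImageMagick is installed and on the PATH. As an alternative, a sketch using the magick package (not loaded above; fps = 2 roughly matches the 40 cs delay):

library(magick)

# Read the saved frames in order and write them out as an animated gif
frames <- image_read(paste0(1:15, "_animate.png"))
image_write(image_animate(frames, fps = 2), "animation.gif")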

[animation.gif: ground-truth frames alongside the network's predictions]