Train CPC inspired model — train_model_cpc

Train a CPC (Oord et al.) inspired neural network on genomic data.

Usage

train_model_cpc(
  train_type = "CPC",
  encoder = NULL,
  context = NULL,
  path,
  path_val = NULL,
  path_checkpoint = NULL,
  path_tensorboard = NULL,
  train_val_ratio = 0.2,
  run_name,
  batch_size = 32,
  epochs = 100,
  steps_per_epoch = 2000,
  shuffle_file_order = FALSE,
  initial_epoch = 1,
  seed = 1234,
  path_file_log = TRUE,
  train_val_split_csv = NULL,
  file_limit = NULL,
  proportion_per_seq = NULL,
  max_samples = NULL,
  maxlen = NULL,
  patchlen = NULL,
  nopatches = NULL,
  step = NULL,
  file_filter = NULL,
  stride = 0.4,
  pretrained_model = NULL,
  learningrate = 0.001,
  learningrate_schedule = NULL,
  k = 5,
  stepsmin = 2,
  stepsmax = 3,
  emb_scale = 0.1
)

Arguments

train_type: Either "CPC" or "Self-GenomeNet".
encoder: A keras encoder for the cpc function.
context: A keras context model for the cpc function.
path: Path to training data. If train_type is label_folder, should be a vector or list where each entry corresponds to a class (list elements can be directories and/or individual files). If train_type is not label_folder, can be a single directory or file or a list of directories and/or files.
path_val: Path to validation data. See path argument for details.
path_checkpoint: Path to checkpoints folder or NULL. If NULL, checkpoints don't get stored.
path_tensorboard: Path to tensorboard directory or NULL. If NULL, training not tracked on tensorboard.
train_val_ratio: For a generator, defines the fraction of batches that will be used for validation (compared to size of training data), i.e. one validation iteration processes batch_size * steps_per_epoch * train_val_ratio samples. If you use dataset instead of generator and dataset_val is NULL, splits dataset into train/validation data.
run_name: Name of the run. Name will be used to identify output from callbacks.
batch_size: Number of samples used for one network update.
epochs: Number of iterations.
steps_per_epoch: Number of training batches per epoch.
shuffle_file_order: Boolean, whether to go through files sequentially or shuffle beforehand.
initial_epoch: Epoch at which to start training. Note that the network will run for (epochs - initial_epoch) rounds and not epochs rounds.
seed: Sets seed for reproducible results.
path_file_log: Write name of files to csv file if path is specified.
train_val_split_csv: A csv file specifying train/validation split. csv file should contain one column named "file" and one column named "type". The "file" column contains names of fasta/fastq files and "type" column specifies if file is used for training or validation. Entries in "type" must be named "train" or "val", otherwise file will not be used for either. path and path_val arguments should be the same. Not implemented for train_type = "label_folder".
file_limit: Integer or NULL. If integer, use only specified number of randomly sampled files for training. Ignored if greater than number of files in path.
proportion_per_seq: Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).
max_samples: Maximum number of samples to use from one file. If not NULL and file has more than max_samples samples, will randomly choose a subset of max_samples samples.
maxlen: Length of predictor sequence.
patchlen: The length of a patch when splitting the input sequence.
nopatches: The number of patches when splitting the input sequence.
step: Frequency of sampling steps.
file_filter: Vector of file names to use from path.
stride: The overlap between two patches when splitting the input sequence.
pretrained_model: A pretrained keras model, for which training will be continued.
learningrate: A Tensor, floating point value. If a schedule is defined, this value gives the initial learning rate. Defaults to 0.001.
learningrate_schedule: A schedule for a non-constant learning rate over the training. Either "cosine_annealing", "step_decay", or "exp_decay".
k: Value of k for sparse top k categorical accuracy. Defaults to 5.
stepsmin: In CPC, a patch is predicted given another patch. stepsmin defines how many patches between these two should be ignored during prediction.
stepsmax: The maximum distance between the predicted patch and the given patch.
emb_scale: Scales the impact of a patch's context.

Value

A list of training metrics.

Examples

if (FALSE) { # reticulate::py_module_available("tensorflow")

# Set up four temporary directories (two for training, two for validation)
# and fill each of them with small dummy sequence files.
path_train_1 <- tempfile()
path_train_2 <- tempfile()
path_val_1 <- tempfile()
path_val_2 <- tempfile()

dummy_dirs <- c(path_train_1, path_train_2, path_val_1, path_val_2)
for (dir_path in dummy_dirs) {
  # tempfile() only reserves a name, so the directory has to be created
  # before files can be written into it.
  dir.create(dir_path)
  deepG::create_dummy_data(
    file_path = dir_path,
    num_files = 3,
    seq_length = 10,
    num_seq = 5,
    vocabulary = c("a", "c", "g", "t")
  )
}

# create model
# Example CPC encoder: cuts a one-hot encoded sequence into overlapping
# patches and embeds every patch with a conv -> max-pool -> LSTM -> dense
# stack.
#
# Arguments:
#   maxlen     Length of the input sequence; the model input has shape
#              (maxlen, 4).
#   patchlen   Length of one patch. Coerced to integer because it is handed
#              directly to TensorFlow ops.
#   nopatches  Number of patches per sequence. When NULL it is computed with
#              nopatchescalc() from patchlen, maxlen and the stride length.
#   eval       Not used inside this function; kept so the signature stays
#              unchanged for callers.
#
# Returns:
#   A keras model mapping the input to a tensor reshaped to
#   (-1, nopatches, 925), i.e. one 925-dimensional embedding per patch.
encoder <- function(maxlen = NULL,
                    patchlen = NULL,
                    nopatches = NULL,
                    eval = FALSE) {
  # reticulate converts R doubles to Python floats, but the patch sizes and
  # strides passed to tf$image$extract_patches() must be integers.
  patchlen <- as.integer(patchlen)

  # Consecutive patches start `stride_fraction * patchlen` positions apart.
  # Defined once so the patch count and the actual stride cannot drift apart.
  stride_fraction <- 0.4
  stridelen <- as.integer(stride_fraction * patchlen)

  if (is.null(nopatches)) {
    nopatches <- nopatchescalc(patchlen, maxlen, patchlen * stride_fraction)
  }

  inp <- keras::layer_input(shape = c(maxlen, 4))

  # Add a trailing channel axis so the sequence can be treated as an image of
  # shape (maxlen, 4, 1), slide a (patchlen x 4) window along the sequence
  # axis, and finally flatten to (-1, patchlen, 4) so that each patch is
  # processed by the layers below as an independent sample.
  createpatches <- inp %>%
    keras::layer_reshape(list(maxlen, 4L, 1L), name = "prep_reshape1", dtype = "float32") %>%
    tensorflow::tf$image$extract_patches(
      sizes = list(1L, patchlen, 4L, 1L),
      strides = list(1L, stridelen, 4L, 1L),
      rates = list(1L, 1L, 1L, 1L),
      padding = "VALID",
      name = "prep_patches"
    ) %>%
    keras::layer_reshape(list(nopatches, patchlen, 4L),
                         name = "prep_reshape2") %>%
    tensorflow::tf$reshape(list(-1L, patchlen, 4L),
                           name = "prep_reshape3")

  # Per-patch embedding network. No `input_shape` is passed to the first
  # layer: it is applied to an existing tensor of shape (patchlen, 4), so an
  # explicit (maxlen, 4) input shape would only be misleading.
  danQ <- createpatches %>%
    keras::layer_conv_1d(
      filters = 320L,
      kernel_size = 26L,
      activation = "relu"
    ) %>%
    keras::layer_max_pooling_1d(pool_size = 13L, strides = 13L) %>%
    keras::layer_dropout(0.2) %>%
    keras::layer_lstm(units = 320, return_sequences = TRUE) %>%
    keras::layer_dropout(0.5) %>%
    keras::layer_flatten() %>%
    keras::layer_dense(925, activation = "relu")

  # Regroup the per-patch embeddings by sequence. A plain R integer is used
  # for the patch dimension instead of an int16 tensor, which would be
  # needlessly narrow for a shape entry.
  patchesback <- danQ %>%
    tensorflow::tf$reshape(list(-1L, as.integer(nopatches), 925L))

  keras::keras_model(inp, patchesback)
}

# Example CPC context network: a single LSTM layer that returns one output
# per time step of its input.
#
# Arguments:
#   latents  Input tensor for the LSTM (presumably the patch embeddings
#            produced by the encoder -- confirm against train_model_cpc()).
#
# Returns:
#   The output tensor of the LSTM layer named "context_LSTM_1".
context <- function(latents) {
  latents %>%
    keras::layer_lstm(
      return_sequences = TRUE,
      units = 256,  # was 2048
      name = "context_LSTM_1",
      activation = "relu"
    )
}

# Run a short training session; checkpoints are written to the session's
# temporary directory.
temp_dir <- tempdir()
hist <- train_model_cpc(
  train_type = "CPC",
  # CPC building blocks
  encoder = encoder,
  context = context,
  # data and generator settings
  path = c(path_train_1, path_train_2),
  path_val = c(path_val_1, path_val_2),
  path_checkpoint = temp_dir,
  run_name = "TEST",
  batch_size = 8,
  epochs = 3,
  steps_per_epoch = 6,
  patchlen = 100,
  nopatches = 8
)
                
 
}