Skip to contents

Train a CPC (Oord et al.) inspired neural network on genomic data.

Usage

train_model_cpc(
  train_type = "CPC",
  encoder = NULL,
  context = NULL,
  path,
  path_val = NULL,
  path_checkpoint = NULL,
  path_tensorboard = NULL,
  train_val_ratio = 0.2,
  run_name,
  batch_size = 32,
  epochs = 100,
  steps_per_epoch = 2000,
  shuffle_file_order = FALSE,
  initial_epoch = 1,
  seed = 1234,
  path_file_log = TRUE,
  train_val_split_csv = NULL,
  file_limit = NULL,
  proportion_per_seq = NULL,
  max_samples = NULL,
  maxlen = NULL,
  patchlen = NULL,
  nopatches = NULL,
  step = NULL,
  file_filter = NULL,
  stride = 0.4,
  pretrained_model = NULL,
  learningrate = 0.001,
  learningrate_schedule = NULL,
  k = 5,
  stepsmin = 2,
  stepsmax = 3,
  emb_scale = 0.1
)

Arguments

train_type

Either "CPC" or "Self-GenomeNet".

encoder

A keras encoder for the cpc function.

context

A keras context model for the cpc function.

path

Path to training data. If train_type is label_folder, should be a vector or list where each entry corresponds to a class (list elements can be directories and/or individual files). If train_type is not label_folder, can be a single directory or file or a list of directories and/or files.

path_val

Path to validation data. See path argument for details.

path_checkpoint

Path to checkpoints folder or NULL. If NULL, checkpoints don't get stored.

path_tensorboard

Path to tensorboard directory or NULL. If NULL, training not tracked on tensorboard.

train_val_ratio

For generator, defines the fraction of batches that will be used for validation (compared to size of training data), i.e. one validation iteration processes batch_size * steps_per_epoch * train_val_ratio samples. If you use dataset instead of generator and dataset_val is NULL, splits dataset into train/validation data.

run_name

Name of the run. Name will be used to identify output from callbacks.

batch_size

Number of samples used for one network update.

epochs

Number of iterations.

steps_per_epoch

Number of training batches per epoch.

shuffle_file_order

Boolean, whether to go through files sequentially or shuffle beforehand.

initial_epoch

Epoch at which to start training. Note that the network will run for (epochs - initial_epoch) rounds and not epochs rounds.

seed

Sets seed for reproducible results.

path_file_log

Write name of files to csv file if path is specified.

train_val_split_csv

A csv file specifying train/validation split. csv file should contain one column named "file" and one column named "type". The "file" column contains names of fasta/fastq files and "type" column specifies if file is used for training or validation. Entries in "type" must be named "train" or "val", otherwise file will not be used for either. path and path_val arguments should be the same. Not implemented for train_type = "label_folder".

file_limit

Integer or NULL. If integer, use only specified number of randomly sampled files for training. Ignored if greater than number of files in path.

proportion_per_seq

Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).

max_samples

Maximum number of samples to use from one file. If not NULL and file has more than max_samples samples, will randomly choose a subset of max_samples samples.

maxlen

Length of predictor sequence.

patchlen

The length of a patch when splitting the input sequence.

nopatches

The number of patches when splitting the input sequence.

step

Frequency of sampling steps.

file_filter

Vector of file names to use from path.

stride

The overlap between two patches when splitting the input sequence.

pretrained_model

A pretrained keras model, for which training will be continued.

learningrate

A Tensor, floating point value. If a schedule is defined, this value gives the initial learning rate. Defaults to 0.001.

learningrate_schedule

A schedule for a non-constant learning rate over the training. Either "cosine_annealing", "step_decay", or "exp_decay".

k

Value of k for sparse top k categorical accuracy. Defaults to 5.

stepsmin

In CPC, a patch is predicted given another patch. stepsmin defines how many patches between these two should be ignored during prediction.

stepsmax

The maximum distance between the predicted patch and the given patch.

emb_scale

Scales the impact of a patch's context.

Value

A list of training metrics.

Examples

if (FALSE) { # reticulate::py_module_available("tensorflow")

# Create dummy data: four temporary directories (two used as training paths,
# two as validation paths below), each populated via deepG::create_dummy_data().
path_train_1 <- tempfile()
path_train_2 <- tempfile()
path_val_1 <- tempfile()
path_val_2 <- tempfile()

dummy_dirs <- c(path_train_1, path_train_2, path_val_1, path_val_2)
for (dummy_dir in dummy_dirs) {
  # tempfile() only returns a name, so the directory has to be created first.
  dir.create(dummy_dir)
  deepG::create_dummy_data(
    file_path = dummy_dir,
    num_files = 3,
    seq_length = 10,
    num_seq = 5,
    vocabulary = c("a", "c", "g", "t")
  )
}

# create model
# Example encoder: splits an input of shape (maxlen, 4) into patches and
# encodes every patch with a conv -> max-pool -> LSTM -> dense stack.
#   maxlen    - first dimension of the input layer.
#   patchlen  - length of each extracted patch.
#   nopatches - number of patches; when NULL it is taken from nopatchescalc().
#   eval      - not used inside this function.
# Returns a keras model whose output is reshaped to (-1, nopatches, 925).
encoder <- function(maxlen = NULL,
                    patchlen = NULL,
                    nopatches = NULL,
                    eval = FALSE) {
  if (is.null(nopatches)) {
    nopatches <- nopatchescalc(patchlen, maxlen, patchlen * 0.4)
  }

  tf <- tensorflow::tf
  model_input <- keras::layer_input(shape = c(maxlen, 4))
  # Patches are extracted every 40% of a patch length (truncated to integer).
  patch_stride <- as.integer(0.4 * patchlen)

  # Append a trailing axis of size 1 so tf$image$extract_patches can be used.
  as_image <- keras::layer_reshape(
    model_input,
    list(maxlen, 4L, 1L),
    name = "prep_reshape1",
    dtype = "float32"
  )
  raw_patches <- tf$image$extract_patches(
    as_image,
    sizes = list(1L, patchlen, 4L, 1L),
    strides = list(1L, patch_stride, 4L, 1L),
    rates = list(1L, 1L, 1L, 1L),
    padding = "VALID",
    name = "prep_patches"
  )
  per_sample_patches <- keras::layer_reshape(
    raw_patches,
    list(nopatches, patchlen, 4L),
    name = "prep_reshape2"
  )
  # Fold the patch axis into the leading axis so each patch is encoded alone.
  flat_patches <- tf$reshape(
    per_sample_patches,
    list(-1L, patchlen, 4L),
    name = "prep_reshape3"
  )

  # Per-patch encoder stack.
  conv_out <- keras::layer_conv_1d(
    flat_patches,
    input_shape = c(maxlen, 4L),
    filters = 320L,
    kernel_size = 26L,
    activation = "relu"
  )
  pooled <- keras::layer_max_pooling_1d(conv_out, pool_size = 13L, strides = 13L)
  pooled <- keras::layer_dropout(pooled, 0.2)
  recurrent <- keras::layer_lstm(pooled, units = 320, return_sequences = TRUE)
  recurrent <- keras::layer_dropout(recurrent, 0.5)
  patch_codes <- keras::layer_dense(
    keras::layer_flatten(recurrent),
    925,
    activation = "relu"
  )

  # Restore the patch axis: one 925-dimensional code per patch.
  codes_by_patch <- tf$reshape(
    patch_codes,
    list(-1L, tf$cast(nopatches, tf$int16), 925L)
  )
  keras::keras_model(model_input, codes_by_patch)
}

# Example context network: applies a single LSTM layer (256 units, ReLU
# activation, returning the full sequence) to the encoder output.
#   latents - tensor produced by the encoder.
# Returns the output of the LSTM layer.
context <- function(latents) {
  keras::layer_lstm(
    latents,
    return_sequences = TRUE,
    units = 256,  # WAS: 2048,
    name = "context_LSTM_1",
    activation = "relu"
  )
}

# train model
# Checkpoints are written to the session's temporary directory.
temp_dir <- tempdir()
hist <- train_model_cpc(
  train_type = "CPC",
  run_name = "TEST",
  # cpc functions
  encoder = encoder,
  context = context,
  # data and checkpoint locations
  path = c(path_train_1, path_train_2),
  path_val = c(path_val_1, path_val_2),
  path_checkpoint = temp_dir,
  # generator / training settings
  batch_size = 8,
  epochs = 3,
  steps_per_epoch = 6,
  patchlen = 100,
  nopatches = 8
)
                
 
}