Train a CPC (Oord et al.) inspired neural network on genomic data.
Usage
train_model_cpc(
  train_type = "CPC",
  encoder = NULL,
  context = NULL,
  path,
  path_val = NULL,
  path_checkpoint = NULL,
  path_tensorboard = NULL,
  train_val_ratio = 0.2,
  run_name,
  batch_size = 32,
  epochs = 100,
  steps_per_epoch = 2000,
  shuffle_file_order = FALSE,
  initial_epoch = 1,
  seed = 1234,
  path_file_log = TRUE,
  train_val_split_csv = NULL,
  file_limit = NULL,
  proportion_per_seq = NULL,
  max_samples = NULL,
  maxlen = NULL,
  patchlen = NULL,
  nopatches = NULL,
  step = NULL,
  file_filter = NULL,
  stride = 0.4,
  pretrained_model = NULL,
  learningrate = 0.001,
  learningrate_schedule = NULL,
  k = 5,
  stepsmin = 2,
  stepsmax = 3,
  emb_scale = 0.1
)
Arguments
- train_type
- Either `"cpc"` or `"Self-GenomeNet"`.
- encoder
- A keras encoder for the cpc function. 
- context
- A keras context model for the cpc function. 
- path
- Path to training data. If `train_type` is `label_folder`, should be a vector or list where each entry corresponds to a class (list elements can be directories and/or individual files). If `train_type` is not `label_folder`, can be a single directory or file or a list of directories and/or files.
- path_val
- Path to validation data. See the `path` argument for details.
- path_checkpoint
- Path to checkpoints folder or `NULL`. If `NULL`, checkpoints don't get stored.
- path_tensorboard
- Path to tensorboard directory or `NULL`. If `NULL`, training is not tracked on tensorboard.
- train_val_ratio
- For a generator, defines the fraction of batches that will be used for validation (compared to size of training data), i.e. one validation iteration processes `batch_size` * `steps_per_epoch` * `train_val_ratio` samples. If you use a dataset instead of a generator and `dataset_val` is `NULL`, splits `dataset` into train/validation data.
- run_name
- Name of the run. Name will be used to identify output from callbacks. 
- batch_size
- Number of samples used for one network update. 
- epochs
- Number of iterations. 
- steps_per_epoch
- Number of training batches per epoch. 
- shuffle_file_order
- Boolean, whether to go through files sequentially or shuffle beforehand. 
- initial_epoch
- Epoch at which to start training. Note that the network will run for (`epochs` - `initial_epoch`) rounds and not `epochs` rounds.
- seed
- Sets seed for reproducible results. 
- path_file_log
- Write name of files to csv file if path is specified. 
- train_val_split_csv
- A csv file specifying train/validation split. The csv file should contain one column named `"file"` and one column named `"type"`. The `"file"` column contains names of fasta/fastq files and the `"type"` column specifies if a file is used for training or validation. Entries in `"type"` must be named `"train"` or `"val"`, otherwise the file will not be used for either. The `path` and `path_val` arguments should be the same. Not implemented for `train_type = "label_folder"`.
- file_limit
- Integer or `NULL`. If integer, use only the specified number of randomly sampled files for training. Ignored if greater than the number of files in `path`.
- proportion_per_seq
- Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence). 
- max_samples
- Maximum number of samples to use from one file. If not `NULL` and the file has more than `max_samples` samples, will randomly choose a subset of `max_samples` samples.
- maxlen
- Length of predictor sequence. 
- patchlen
- The length of a patch when splitting the input sequence. 
- nopatches
- The number of patches when splitting the input sequence. 
- step
- Frequency of sampling steps. 
- file_filter
- Vector of file names to use from path_corpus. 
- stride
- The overlap between two patches when splitting the input sequence. 
- pretrained_model
- A pretrained keras model, for which training will be continued.
- learningrate
- A Tensor, floating point value. If a schedule is defined, this value gives the initial learning rate. Defaults to 0.001.
- learningrate_schedule
- A schedule for a non-constant learning rate over the training. Either "cosine_annealing", "step_decay", or "exp_decay". 
- k
- Value of k for sparse top k categorical accuracy. Defaults to 5. 
- stepsmin
- In CPC, a patch is predicted given another patch. stepsmin defines how many patches between these two should be ignored during prediction. 
- stepsmax
- The maximum distance between the predicted patch and the given patch. 
- emb_scale
- Scales the impact of a patch's context.
Examples
if (FALSE) { # reticulate::py_module_available("tensorflow")
#create dummy data
# Reserve four temporary paths: two for training data, two for validation data.
path_train_1 <- tempfile()
path_train_2 <- tempfile()
path_val_1 <- tempfile()
path_val_2 <- tempfile()

# Create each path as a directory and populate it via
# deepG::create_dummy_data() (3 files, 5 sequences of length 10 per the
# arguments below, over the vocabulary a/c/g/t).
dummy_dirs <- c(path_train_1, path_train_2, path_val_1, path_val_2)
for (dummy_dir in dummy_dirs) {
  dir.create(dummy_dir)
  deepG::create_dummy_data(
    file_path = dummy_dir,
    num_files = 3,
    seq_length = 10,
    num_seq = 5,
    vocabulary = c("a", "c", "g", "t")
  )
}
# create model
# Example CPC encoder. The input of shape (maxlen, 4) is cut into overlapping
# patches, every patch is passed through a conv / pooling / LSTM / dense stack,
# and the 925-dimensional patch embeddings are regrouped per input sequence.
#
# Arguments:
#   maxlen     Length of the input sequence (first dimension of the input).
#   patchlen   Length of a single patch.
#   nopatches  Number of patches; when NULL it is obtained from
#              nopatchescalc().
#   eval       Not used inside this function.
#
# Returns a keras model mapping the input layer to the regrouped embeddings.
encoder <- function(maxlen = NULL,
                    patchlen = NULL,
                    nopatches = NULL,
                    eval = FALSE) {
  # Distance between the starts of two consecutive patches: 40% of a patch.
  patch_stride <- as.integer(0.4 * patchlen)
  if (is.null(nopatches)) {
    nopatches <- nopatchescalc(patchlen, maxlen, patchlen * 0.4)
  }

  model_input <- keras::layer_input(shape = c(maxlen, 4))

  # Append a trailing axis of size 1 so tf$image$extract_patches() can be
  # applied to the sequence.
  as_image <- keras::layer_reshape(
    model_input,
    list(maxlen, 4L, 1L),
    name = "prep_reshape1",
    dtype = "float32"
  )
  raw_patches <- tensorflow::tf$image$extract_patches(
    as_image,
    sizes = list(1L, patchlen, 4L, 1L),
    strides = list(1L, patch_stride, 4L, 1L),
    rates = list(1L, 1L, 1L, 1L),
    padding = "VALID",
    name = "prep_patches"
  )
  patch_grid <- keras::layer_reshape(
    raw_patches,
    list(nopatches, patchlen, 4L),
    name = "prep_reshape2"
  )
  # Merge the patch axis into the leading axis so that each patch is encoded
  # on its own.
  flat_patches <- tensorflow::tf$reshape(
    patch_grid,
    list(-1L, patchlen, 4L),
    name = "prep_reshape3"
  )

  # Per-patch encoder stack (the original example names this part "danQ").
  conv_out <- keras::layer_conv_1d(
    flat_patches,
    input_shape = c(maxlen, 4L),
    filters = 320L,
    kernel_size = 26L,
    activation = "relu"
  )
  pooled <- keras::layer_max_pooling_1d(conv_out, pool_size = 13L, strides = 13L)
  pooled <- keras::layer_dropout(pooled, 0.2)
  recurrent <- keras::layer_lstm(pooled, units = 320, return_sequences = TRUE)
  recurrent <- keras::layer_dropout(recurrent, 0.5)
  flattened <- keras::layer_flatten(recurrent)
  patch_embeddings <- keras::layer_dense(flattened, 925, activation = "relu")

  # Restore the patch axis: one row of 925 features per patch.
  sequence_embeddings <- tensorflow::tf$reshape(
    patch_embeddings,
    list(-1L, tensorflow::tf$cast(nopatches, tensorflow::tf$int16), 925L)
  )
  keras::keras_model(model_input, sequence_embeddings)
}
# Example CPC context network: applies a single LSTM layer to the encoder
# output.
#
# Arguments:
#   latents  Output of the encoder, passed as the input of the LSTM layer.
#
# Returns the output of the LSTM layer (the full sequence, since
# return_sequences = TRUE).
context <- function(latents) {
  keras::layer_lstm(
    latents,
    return_sequences = TRUE,
    units = 256,  # reduced from 2048
    name = "context_LSTM_1",
    activation = "relu"
  )
}
# train model
# The session's temporary directory is passed as the checkpoint folder.
temp_dir <- tempdir()
hist <- train_model_cpc(
  train_type = "CPC",
  # CPC model builders
  encoder = encoder,
  context = context,
  # Data locations and output
  path = c(path_train_1, path_train_2),
  path_val = c(path_val_1, path_val_2),
  path_checkpoint = temp_dir,
  run_name = "TEST",
  # Training schedule
  batch_size = 8,
  epochs = 3,
  steps_per_epoch = 6,
  # Patch layout
  patchlen = 100,
  nopatches = 8
)
                
 
}
