Train a CPC (Oord et al.) inspired neural network on genomic data.
Usage
train_model_cpc(
train_type = "CPC",
encoder = NULL,
context = NULL,
path,
path_val = NULL,
path_checkpoint = NULL,
path_tensorboard = NULL,
train_val_ratio = 0.2,
run_name,
batch_size = 32,
epochs = 100,
steps_per_epoch = 2000,
shuffle_file_order = FALSE,
initial_epoch = 1,
seed = 1234,
path_file_log = TRUE,
train_val_split_csv = NULL,
file_limit = NULL,
proportion_per_seq = NULL,
max_samples = NULL,
maxlen = NULL,
patchlen = NULL,
nopatches = NULL,
step = NULL,
file_filter = NULL,
stride = 0.4,
pretrained_model = NULL,
learningrate = 0.001,
learningrate_schedule = NULL,
k = 5,
stepsmin = 2,
stepsmax = 3,
emb_scale = 0.1
)
Arguments
- train_type
Either "CPC" or "Self-GenomeNet".
- encoder
A keras encoder for the CPC function.
- context
A keras context model for the CPC function.
- path
Path to training data. If train_type is "label_folder", should be a vector or list where each entry corresponds to a class (list elements can be directories and/or individual files). If train_type is not "label_folder", can be a single directory or file or a list of directories and/or files.
- path_val
Path to validation data. See the path argument for details.
- path_checkpoint
Path to checkpoints folder or NULL. If NULL, checkpoints don't get stored.
- path_tensorboard
Path to tensorboard directory or NULL. If NULL, training is not tracked on tensorboard.
- train_val_ratio
For a generator, defines the fraction of batches that will be used for validation (compared to size of training data), i.e. one validation iteration processes batch_size * steps_per_epoch * train_val_ratio samples. If you use a dataset instead of a generator and dataset_val is NULL, splits dataset into train/validation data.
- run_name
Name of the run. Name will be used to identify output from callbacks.
- batch_size
Number of samples used for one network update.
- epochs
Number of iterations.
- steps_per_epoch
Number of training batches per epoch.
- shuffle_file_order
Boolean, whether to go through files sequentially or shuffle beforehand.
- initial_epoch
Epoch at which to start training. Note that the network will run for (epochs - initial_epoch) rounds and not epochs rounds.
- seed
Sets seed for reproducible results.
- path_file_log
Write name of files to csv file if path is specified.
- train_val_split_csv
A csv file specifying the train/validation split. The csv file should contain one column named "file" and one column named "type". The "file" column contains names of fasta/fastq files and the "type" column specifies whether the file is used for training or validation. Entries in "type" must be named "train" or "val", otherwise the file will not be used for either. The path and path_val arguments should be the same. Not implemented for train_type = "label_folder".
- file_limit
Integer or NULL. If integer, use only the specified number of randomly sampled files for training. Ignored if greater than the number of files in path.
- proportion_per_seq
Numerical value between 0 and 1. Proportion of sequence to take samples from (use random subsequence).
- max_samples
Maximum number of samples to use from one file. If not NULL and the file has more than max_samples samples, will randomly choose a subset of max_samples samples.
- maxlen
Length of predictor sequence.
- patchlen
The length of a patch when splitting the input sequence.
- nopatches
The number of patches when splitting the input sequence.
- step
Frequency of sampling steps.
- file_filter
Vector of file names to use from path.
- stride
The overlap between two patches when splitting the input sequence.
- pretrained_model
A pretrained keras model for which training will be continued.
- learningrate
A Tensor, floating point value. If a schedule is defined, this value gives the initial learning rate. Defaults to 0.001.
- learningrate_schedule
A schedule for a non-constant learning rate over the training. Either "cosine_annealing", "step_decay", or "exp_decay".
- k
Value of k for sparse top k categorical accuracy. Defaults to 5.
- stepsmin
In CPC, a patch is predicted given another patch. stepsmin defines how many patches between these two should be ignored during prediction.
- stepsmax
The maximum distance between the predicted patch and the given patch.
- emb_scale
Scales the impact of a patch's context.
Examples
if (FALSE) { # reticulate::py_module_available("tensorflow")
# Create dummy training/validation data: four temp directories, each filled
# with 3 fasta files of 5 sequences (length 10) over the DNA alphabet.
path_train_1 <- tempfile()
path_train_2 <- tempfile()
path_val_1 <- tempfile()
path_val_2 <- tempfile()
for (current_path in c(path_train_1, path_train_2,
path_val_1, path_val_2)) {
dir.create(current_path)
deepG::create_dummy_data(file_path = current_path,
num_files = 3,
seq_length = 10,
num_seq = 5,
vocabulary = c("a", "c", "g", "t"))
}
# CPC encoder: cuts the (maxlen, 4) input into overlapping patches and maps
# each patch to a 925-dimensional latent vector via a DanQ-style network
# (conv -> max pooling -> LSTM -> dense).
encoder <- function(maxlen = NULL,
patchlen = NULL,
nopatches = NULL,
eval = FALSE) {
# Derive the patch count from maxlen/patchlen with a 0.4 * patchlen stride
# if not given (nopatchescalc is a deepG helper).
if (is.null(nopatches)) {
nopatches <- nopatchescalc(patchlen, maxlen, patchlen * 0.4)
}
inp <- keras::layer_input(shape = c(maxlen, 4))
stridelen <- as.integer(0.4 * patchlen)
# Extract overlapping (patchlen, 4) patches, then fold the patch axis into
# the batch axis so the network below runs once per patch.
createpatches <- inp %>%
keras::layer_reshape(list(maxlen, 4L, 1L), name = "prep_reshape1", dtype = "float32") %>%
tensorflow::tf$image$extract_patches(
sizes = list(1L, patchlen, 4L, 1L),
strides = list(1L, stridelen, 4L, 1L),
rates = list(1L, 1L, 1L, 1L),
padding = "VALID",
name = "prep_patches"
) %>%
keras::layer_reshape(list(nopatches, patchlen, 4L),
name = "prep_reshape2") %>%
tensorflow::tf$reshape(list(-1L, patchlen, 4L),
name = "prep_reshape3")
# NOTE(review): input_shape here appears to describe the pre-patch input;
# each patch fed to the conv layer is (patchlen, 4) — confirm against deepG.
danQ <- createpatches %>%
keras::layer_conv_1d(
input_shape = c(maxlen, 4L),
filters = 320L,
kernel_size = 26L,
activation = "relu"
) %>%
keras::layer_max_pooling_1d(pool_size = 13L, strides = 13L) %>%
keras::layer_dropout(0.2) %>%
keras::layer_lstm(units = 320, return_sequences = TRUE) %>%
keras::layer_dropout(0.5) %>%
keras::layer_flatten() %>%
keras::layer_dense(925, activation = "relu")
# Restore the patch axis: latents have shape (batch, nopatches, 925).
patchesback <- danQ %>%
tensorflow::tf$reshape(list(-1L, tensorflow::tf$cast(nopatches, tensorflow::tf$int16), 925L))
keras::keras_model(inp, patchesback)
}
# CPC context network: an LSTM over the patch latents producing one context
# vector per patch (return_sequences = TRUE).
context <- function(latents) {
cres <- latents
cres_dim = cres$shape
predictions <-
cres %>%
keras::layer_lstm(
return_sequences = TRUE,
units = 256, # reduced from 2048 to keep the example lightweight
name = paste("context_LSTM_1",
sep = ""),
activation = "relu"
)
return(predictions)
}
# Train the CPC model on the dummy data for a few short epochs, storing
# checkpoints in a temporary directory.
temp_dir <- tempdir()
hist <- train_model_cpc(train_type = "CPC",
### cpc functions ###
encoder = encoder,
context = context,
#### Generator settings ####
path_checkpoint = temp_dir,
path = c(path_train_1, path_train_2),
path_val = c(path_val_1, path_val_2),
run_name = "TEST",
batch_size = 8,
epochs = 3,
steps_per_epoch = 6,
patchlen = 100,
nopatches = 8)
}