The deepG library offers several options to extract input/target pairs from data. We can differentiate between two main approaches:
- Language model: predict a character or several characters in a sequence.
- Label classification: map a label to a sequence.
Language model
By a language model, we mean a model that predicts a character in a sequence. The output_format argument of the data generator determines how a sequence is split into input and target. Assume the sequence abcdefg and maxlen = 6. The input/target pairs then correspond as follows (see the short sketch after the list):
“target_right”: abcdef, g
“target_middle_lstm”: (abc, gfe), d (note the reversed order of the second input)
“target_middle_cnn”: abcefg, d
“wavenet”: abcdef, bcdefg
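To make this concrete, here is a small base-R illustration (plain string operations, not deepG code) of which characters end up in the input and which in the target for each format; in the actual generator the characters are additionally encoded numerically (e.g. one-hot).
s <- "abcdefg"

# "target_right": input abcdef, target g
list(x = substr(s, 1, 6), y = substr(s, 7, 7))

# "target_middle_lstm": inputs abc and gfe (second part reversed), target d
rev_second <- paste(rev(strsplit(substr(s, 5, 7), "")[[1]]), collapse = "")
list(x = list(substr(s, 1, 3), rev_second), y = substr(s, 4, 4))

# "target_middle_cnn": input abcefg (sequence without the target), target d
list(x = paste0(substr(s, 1, 3), substr(s, 5, 7)), y = substr(s, 4, 4))

# "wavenet": input abcdef, target bcdefg (input shifted by one position)
list(x = substr(s, 1, 6), y = substr(s, 2, 7))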
Create dummy data
To test the different language model options, we create a simple dummy data set consisting of 50 repetitions of the sequence AAACCCGGGTTT (AAACCCGGGTTTAAACCC…).
vocabulary <- c("A", "C", "G", "T")
base_seq <- "AAACCCGGGTTT"
full_seq <- strrep(base_seq, 50)
df <- data.frame(Header = "header", Sequence = full_seq)
# create training fasta file
train_dir <- tempfile()
dir.create(train_dir)
microseq::writeFasta(df, file.path(train_dir, "train_1.fasta"))
# create validation fasta file (use same data as training)
val_dir <- tempfile()
dir.create(val_dir)
microseq::writeFasta(df, file.path(val_dir, "val_1.fasta"))
Predict next character
Say we want to predict the next character in a sequence given the last 5 characters, where our text consists of the letters A, C, G, T. First we have to create a model; we may use a model with 1 LSTM and 1 dense layer for predictions.
model <- create_model_lstm_cnn(
  maxlen = 5,
  layer_lstm = c(8),
  layer_dense = c(4),
  learning_rate = 0.1,
  vocabulary_size = 4 # text consists of A,C,G,T
)
Next we have to specify the location of our training and validation data and the output format of the data generator.
hist <- train_model(train_type = "lm", # running a language model
                    output_format = "target_right", # predict target at end of sequence
                    model = model,
                    path = train_dir,
                    path_val = val_dir,
                    steps_per_epoch = 5, # use 5 batches per epoch
                    train_val_ratio = 0.2, # use 20% of samples for validation compared to train
                    batch_size = 16,
                    epochs = 4)
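After training, we can optionally sanity-check the model on a hand-built input. The following is a minimal sketch using base R and the Keras predict method; it assumes that the generator one-hot encodes characters in vocabulary order (A, C, G, T).
# optional sanity check (hand-rolled, not a deepG helper): one-hot encode "AAACC"
# and predict the following character; columns are assumed to correspond to A, C, G, T
onehot <- diag(4)
rownames(onehot) <- c("A", "C", "G", "T")
x <- array(onehot[strsplit("AAACC", "")[[1]], ], dim = c(1, 5, 4))
round(model$predict(x), 2) # next-character probabilities; "C" should score highest if training converged
Finally, we plot the training history.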
plot(hist)
Predict character in middle of sequence
If we want to predict a character in the middle of a sequence and use LSTM layers, we should split our input into two layers: one layer handles the sequence before the target and the other the sequence after it. If, for example, we have the
sequence:  ACCGTGGAA
then the first input corresponds to ACCG, the second to AAGG (the part after the target in reversed order) and the target is T. We may create a model with two input layers using create_model_lstm_cnn_target_middle:
model <- create_model_lstm_cnn_target_middle(
  maxlen = 5,
  layer_lstm = c(8),
  layer_dense = c(4),
  learning_rate = 0.1,
  vocabulary_size = 4 
)
The train_model call is identical to the previous one, except that we change the output format of the generator by setting output_format = "target_middle_lstm". This reverses the order of the part of the sequence after the target.
hist <- train_model(train_type = "lm", # running a language model
                    output_format = "target_middle_lstm", # predict target in middle of sequence 
                    model = model,
                    path = train_dir,
                    path_val = val_dir,
                    steps_per_epoch = 5, # use 5 batches per epoch
                    train_val_ratio = 0.2, # use 20% of samples for validation compared to train
                    batch_size = 16,
                    epochs = 4)
plot(hist)
Masked language model
Here we mask parts of the input sequence and the model tries to predict the masked regions. This can be used to train BERT-like models; see also this notebook. We first check how the generator works.
# create dummy training data
nt_seq <- rep(c("A", "C", "G", "T"), each = 25) %>% paste(collapse = "") %>% strrep(10)
df <- data.frame(Sequence = nt_seq, Header = "seq_1")
fasta_path <- tempfile(fileext = ".fasta")
fasta_file <- microseq::writeFasta(df, fasta_path)
masked_lm <- list(mask_rate = 0.10, # replace 10% of input with special mask token
                  random_rate = 0.03, # set 3% of input to random value
                  identity_rate = 0.02, # leave 2% unchanged (and set sample weight to 1)
                  include_sw = TRUE) # 0,1 matrix showing where masking was applied
gen <-  get_generator(path = fasta_path,
                      train_type = "masked_lm",
                      masked_lm = masked_lm,
                      batch_size = 1,
                      n_gram = 1,
                      n_gram_stride = 1,
                      return_int = TRUE,
                      maxlen = 100,
                      vocabulary = c("A", "C", "G", "T"))
z <- gen()
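# the generator returns a list: integer-encoded input (x), targets (y) and sample weights (sw)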
x <- z[[1]]
y <- z[[2]]
sw <- z[[3]]
df <- data.frame(x = x[1, ], y = y[1, ], sw = sw[1, ])
print(head(df, 10))
print(df[df$sw == 1, ]) # rows where masking was applied
Create the model architecture.
model <- create_model_transformer(
  maxlen = 100,
  vocabulary_size = 6,
  embed_dim = 16,
  ff_dim = 32,
  pos_encoding = "embedding",
  head_size = 20,
  num_heads = 4,
  layer_dense = 6,
  flatten_method = "none",
  last_layer_activation = "softmax",
  loss_fn = "sparse_categorical_crossentropy",
  solver = "adam",
  learning_rate = 0.005
)
Train the model.
batch_size <- 128
masked_lm <- list(mask_rate = 0.10, random_rate = 0.03, identity_rate = 0.02, include_sw = TRUE)
hist <- train_model(model = model,
            # training args
            run_name = "bert_1",
            epochs = 8,
            steps_per_epoch = 75,
            # generator args
            maxlen = 100,
            train_type = "masked_lm",
            path = fasta_path,
            path_val = NULL,
            batch_size = batch_size,
            step = 25,
            masked_lm = masked_lm,
            proportion_per_seq = 0.97,
            return_int = TRUE)
plot(hist)
Evaluate the trained model.
gen <-  get_generator(path = fasta_path,
                      train_type = "masked_lm",
                      masked_lm = masked_lm,
                      batch_size = 1,
                      n_gram = 1,
                      n_gram_stride = 1,
                      return_int = TRUE,
                      maxlen = 100,
                      vocabulary = c("A", "C", "G", "T"))
z <- gen()
x <- z[[1]]
y <- z[[2]]
sw <- z[[3]]
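# predicted class probabilities per position; reduce to the most likely class
# and shift by one to match the integer encoding of y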
pred <- model$predict(x) 
pred <- apply(pred[1,,], 1, which.max) - 1 
df <- data.frame(x = x[1, ], sw = sw[1, ], y = y[1, ], pred = pred)
head(df)
df[df$sw == 1, ]
df2 <- df[df$sw == 1, ]
table(df2$pred, df2$y) # confusion matrix of predictions vs. targets at masked positions
Label classification
With label classification, we describe the task of mapping a label to a sequence. For example: given the input ACGACCG, does the sequence belong to a viral or bacterial genome?
deepG offers three options to map a label to a sequence:
- read the label from the fasta header
- put the files of each class into separate folders
- read the label from a csv file
Create dummy data
To test label classification, we create a simple dummy data set: one class consists of random sequences using only the letters A and C, and the second class uses only G and T.
# create training fasta files
train_dir_1 <- tempfile()
train_dir_2 <- tempfile()
dir.create(train_dir_1)
dir.create(train_dir_2)
train_dir <- list(train_dir_1, train_dir_2)
for (i in 1:2) {
  
  if (i == 1) {
    vocabulary <- c("A", "C")
    header <- "label_1"
    fasta_name_start <- "label_1_train_file"
  } else {
    vocabulary <- c("G", "T")
    header <- "label_2"
    fasta_name_start <- "label_2_train_file"
  }
  
  create_dummy_data(file_path = train_dir[[i]],
                    num_files = 3,
                    seq_length = 20, 
                    num_seq = 5,
                    header = header,
                    fasta_name_start = fasta_name_start,
                    vocabulary = vocabulary)
}  
# create validation fasta files
val_dir_1 <- tempfile()
val_dir_2 <- tempfile()
dir.create(val_dir_1)
dir.create(val_dir_2)
val_dir <- list(val_dir_1, val_dir_2)
for (i in 1:2) {
  
  if (i == 1) {
    vocabulary <- c("A", "C")
    header <- "label_1"
    fasta_name_start <- "label_1_val_file"
  } else {
    vocabulary <- c("G", "T")
    header <- "label_2"
    fasta_name_start <- "label_2_val_file"
  }
  
  create_dummy_data(file_path = val_dir[[i]],
                    num_files = 3,
                    seq_length = 20, 
                    num_seq = 5,
                    header = header,
                    fasta_name_start = fasta_name_start,
                    vocabulary = vocabulary)
}
Label by folder
In this approach, we put all data from one class into a separate folder. Say we want to classify whether a sequence belongs to a viral or a bacterial genome; we may then put all virus and all bacteria files into their own folders. In this case the path and path_val arguments should be vectors, where each entry is the path to one class.
First we have to create a model. We may use a model with 1 LSTM and 1 dense layer for predictions. An input sequence has length 5.
model <- create_model_lstm_cnn(
  maxlen = 5,
  layer_lstm = c(8),
  learning_rate = 0.1,
  layer_dense = c(2), # binary classification
  vocabulary_size = 4 # text consists of A,C,G,T
)
train_model(train_type = "label_folder", # reading label from folder  
            model = model,
            path = c(train_dir_1, # note that path has two entries 
                     train_dir_2),
            path_val = c(val_dir_1,
                         val_dir_2),
            steps_per_epoch = 5, # use 5 batches per epoch
            train_val_ratio = 0.2, 
            batch_size = 8,
            epochs = 2,
            vocabulary_label = c("label_1", "label_2") # names of classes
)
Label by fasta header
The fasta headers in our dummy data are named “label_1” or “label_2”:
files <- list.files(train_dir_1, full.names = TRUE)
fasta_file <- microseq::readFasta(files[1])
head(fasta_file)
train_model(train_type = "label_header", # reading label from fasta header  
            model = model,
            path = train_dir,
            path_val = val_dir,
            steps_per_epoch = 5, 
            train_val_ratio = 0.2, 
            batch_size = 8,
            epochs = 2,
            vocabulary_label = c("label_1", "label_2") # names of labels
)
Label from csv file
In this approach we extract the sequence label by mapping the current file name to a csv table.
files_1 <- basename(list.files(c(train_dir_1, val_dir_1)))
files_2 <- basename(list.files(c(train_dir_2, val_dir_2)))
file <- c(files_1, files_2)
label_1 <- stringr::str_detect(file, "label_1") %>% as.integer()
label_2 <- stringr::str_detect(file, "label_2") %>% as.integer()
df <- data.frame(file, label_1, label_2)
df
csv_path <- tempfile(fileext = ".csv")
write.csv(df, csv_path, row.names = FALSE)
hist <- train_model(train_type = "label_csv",
                    target_from_csv = csv_path,
                    model = model,
                    path = train_dir,
                    path_val = val_dir,
                    steps_per_epoch = 5,
                    train_val_ratio = 0.2,
                    batch_size = 8,
                    epochs = 2)
plot(hist)
Training with rds files
We can also use rds files as input; in that case the data must already be preprocessed. We can use the dataset_from_gen function to create rds files from fasta files.
rds_folder_train <- tempfile()
rds_folder_val <- tempfile()
dir.create(rds_folder_train)
dir.create(rds_folder_val)
for (data_type in c("train", "val")) {
  
  if (data_type == "train") {
    output_path <- rds_folder_train
    path_corpus <- train_dir
  } else {
    output_path <- rds_folder_val
    path_corpus <- val_dir
  }
  
  dataset_from_gen(output_path = output_path,
                   iterations = 25, # create 25 rds files 
                   train_type = "label_folder",
                   path_corpus = path_corpus, 
                   batch_size = 128,
                   maxlen = 5,
                   step = 5,
                   vocabulary = c("a", "c", "g", "t"),
                   file_name_start = "batch_")
  
}
We created 25 files each for training and validation, containing preprocessed data.
train_files <- list.files(rds_folder_train, full.names = TRUE)
basename(train_files)
example_batch <- readRDS(train_files[1])
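# each rds file stores one preprocessed batch as a list: input array x and target matrix y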
x <- example_batch[[1]]
y <- example_batch[[2]]
dim(x)
dim(y)
x[1,,]
y[1,]
We can now use these files for training.
model <- create_model_lstm_cnn(
  maxlen = 5,
  layer_lstm = c(8),
  learning_rate = 0.1,
  layer_dense = c(2))
hist <- train_model(train_type = "label_rds",
                    model = model,
                    path = rds_folder_train,
                    path_val = rds_folder_val,
                    steps_per_epoch = 5,
                    format = "rds",
                    batch_size = 8,
                    epochs = 2)
plot(hist)
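As a final sanity check, we can predict on one of the preprocessed batches directly (a minimal sketch using the Keras predict method; each row of the output holds the predicted probabilities for the two classes):
# predict on the first preprocessed training batch
example_batch <- readRDS(train_files[1])
pred <- model$predict(example_batch[[1]])
head(round(pred, 2)) # one row of class probabilities per sample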