
The deepG library offers several options to extract input/target pairs from data. We can differentiate between two main approaches:

  • Language model: predict a character or several characters in a sequence.
  • Label classification: map a label to a sequence.

Language model

By a language model, we mean a model that predicts a character in a sequence. We have several options to determine the output format of the data generator using the output_format argument.

The output_format determines the shape of the output for a language model, i.e. one part of a sequence is the input X and another part the target Y. Assume the sequence abcdefg and maxlen = 6. The output formats then correspond to the following splits:

“target_right”: X = abcdef, Y = g

“target_middle_lstm”: X = (X_1 = abc, X_2 = gfe), Y = d (note the reversed order of X_2)

“target_middle_cnn”: X = abcefg, Y = d

“wavenet”: X = abcdef, Y = bcdefg
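
As a plain base R illustration (not using any deepG functions), the splits for the toy sequence above can be written out as follows:

s <- "abcdefg" # toy sequence, maxlen = 6
# "target_right": the first 6 characters are the input, the last one is the target
x_right <- substr(s, 1, 6); y_right <- substr(s, 7, 7)               # "abcdef", "g"
# "target_middle_lstm": part before the target and, in reversed order, part after it
x1 <- substr(s, 1, 3)                                                 # "abc"
x2 <- paste(rev(strsplit(substr(s, 5, 7), "")[[1]]), collapse = "")   # "gfe"
y_middle <- substr(s, 4, 4)                                           # "d"
# "target_middle_cnn": the whole sequence without the target
x_cnn <- paste0(substr(s, 1, 3), substr(s, 5, 7))                     # "abcefg"
# "wavenet": input and target are the same sequence shifted by one position
x_wave <- substr(s, 1, 6); y_wave <- substr(s, 2, 7)                  # "abcdef", "bcdefg"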

Create dummy data

To test the different language model options, we create a simple dummy data set consisting of a repetition of the sequence AAACCCGGGTTTAAACCC….

vocabulary <- c("A", "C", "G", "T")
base_seq <- "AAACCCGGGTTT"
full_seq <- strrep(base_seq, 50)
df <- data.frame(Header = "header", Sequence = full_seq)

# create training fasta file
train_dir <- tempfile()
dir.create(train_dir)
microseq::writeFasta(df, file.path(train_dir, "train_1.fasta"))
# create validation fasta file (use same data as training)
val_dir <- tempfile()
dir.create(val_dir)
microseq::writeFasta(df, file.path(val_dir, "val_1.fasta"))

Predict next character

Say we want to predict the next character in a sequence given the previous 5 characters, and our text consists of the letters A, C, G, T. First we have to create a model. We may use a model with 1 LSTM and 1 dense layer for predictions.

model <- create_model_lstm_cnn(
  maxlen = 5,
  layer_lstm = c(8),
  layer_dense = c(4),
  learning_rate = 0.1,
  vocabulary_size = 4 # text consists of A,C,G,T
)

Next we have to specify the location of our training and validation data and the output format of the data generator:

hist <- train_model(train_type = "lm", # running a language model
                    output_format = "target_right", # predict target at end of sequence
                    model = model,
                    path = train_dir,
                    path_val = val_dir,
                    steps_per_epoch = 5, # use 5 batches per epoch
                    train_val_ratio = 0.2, # use 20% of samples for validation compared to train
                    batch_size = 16,
                    epochs = 4)
plot(hist)
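
To sanity-check the trained model, we can one-hot encode a 5-character context by hand and ask for the most likely next character. This is only a rough sketch: it assumes the default one-hot encoding with columns in the order A, C, G, T and does not rely on any deepG helper function.

vocabulary <- c("A", "C", "G", "T")
context <- strsplit("AACCC", "")[[1]] # in the dummy data, the next character would be "C"
x <- array(0, dim = c(1, 5, 4))       # (samples, maxlen, vocabulary size)
for (i in seq_along(context)) {
  x[1, i, match(context[i], vocabulary)] <- 1
}
pred <- model$predict(x)              # probabilities for A, C, G, T
vocabulary[which.max(pred[1, ])]      # most likely next character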

Predict character in middle of sequence

If we want to predict a character in the middle of a sequence and use LSTM layers, we should split our input into two parts. One input layer handles the subsequence before the target and the other the subsequence after it. If we have, for example, the

sequence: ACCGTGGAA

then the target is the middle character T; the first input corresponds to ACCG and the second to AAGG (the part after the target in reversed order). We may create a model with two input layers using the create_model_lstm_cnn_target_middle function.

model <- create_model_lstm_cnn_target_middle(
  maxlen = 5,
  layer_lstm = c(8),
  layer_dense = c(4),
  learning_rate = 0.1,
  vocabulary_size = 4 
)

The train_model call is identical to the one for the previous model, except that we have to change the output format of the generator by setting output_format = "target_middle_lstm". This reverses the order of the part of the sequence that comes after the target.

hist <- train_model(train_type = "lm", # running a language model
                    output_format = "target_middle_lstm", # predict target in middle of sequence 
                    model = model,
                    path = train_dir,
                    path_val = val_dir,
                    steps_per_epoch = 5, # use 5 batches per epoch
                    train_val_ratio = 0.2, # use 20% of samples for validation compared to train
                    batch_size = 16,
                    epochs = 4)
plot(hist)

Masked language model

Here we mask some parts of the input sequence and the model tries to predict the masked regions. This can be used for training BERT-like models (see also this notebook). We can first check how the generator works.

# create dummy training data
nt_seq <- rep(c("A", "C", "G", "T"), each = 25) %>% paste(collapse = "") %>% strrep(10)
df <- data.frame(Sequence = nt_seq, Header = "seq_1")
fasta_path <- tempfile(fileext = ".fasta")
fasta_file <- microseq::writeFasta(df, fasta_path)
masked_lm <- list(mask_rate = 0.10, # replace 10% of input with special mask token
                  random_rate = 0.03, # set 3% of input to random value
                  identity_rate = 0.02, # leave 2% unchanged (and set sample weight to 1)
                  include_sw = TRUE) # 0,1 matrix showing where masking was applied

gen <-  get_generator(path = fasta_path,
                      train_type = "masked_lm",
                      masked_lm = masked_lm,
                      batch_size = 1,
                      n_gram = 1,
                      n_gram_stride = 1,
                      return_int = TRUE,
                      maxlen = 100,
                      vocabulary = c("A", "C", "G", "T"))

z <- gen()
x <- z[[1]]
y <- z[[2]]
sw <- z[[3]]
df <- data.frame(x = x[1, ], y = y[1, ], sw = sw[1, ])
print(head(df, 10))
print(df[ df$sw == 1, ])
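
Since include_sw = TRUE returns a 0/1 matrix marking every position that was masked, randomized or deliberately left unchanged, the fraction of positions with sw == 1 should be roughly mask_rate + random_rate + identity_rate = 15%. A quick check:

mean(sw[1, ] == 1) # should be close to 0.15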

Create the model architecture.

model <- create_model_transformer(
  maxlen = 100,
  vocabulary_size = 6,
  embed_dim = 16,
  ff_dim = 32,
  pos_encoding = "embedding",
  head_size = 20,
  num_heads = 4,
  layer_dense = 6,
  flatten_method = "none",
  last_layer_activation = "softmax",
  loss_fn = "sparse_categorical_crossentropy",
  solver = "adam",
  learning_rate = 0.005
)

Train the model.

batch_size <- 128
masked_lm <- list(mask_rate = 0.10, random_rate = 0.03, identity_rate = 0.02, include_sw = TRUE)

hist <- train_model(model = model,
            # training args
            run_name = "bert_1",
            epochs = 8,
            steps_per_epoch = 75,
            # generator args
            maxlen = 100,
            train_type = "masked_lm",
            path = fasta_path,
            path_val = NULL,
            batch_size = batch_size,
            step = 25,
            masked_lm = masked_lm,
            proportion_per_seq = 0.97,
            return_int = TRUE)

plot(hist)

Evaluate the trained model.

gen <-  get_generator(path = fasta_path,
                      train_type = "masked_lm",
                      masked_lm = masked_lm,
                      batch_size = 1,
                      n_gram = 1,
                      n_gram_stride = 1,
                      return_int = TRUE,
                      maxlen = 100,
                      vocabulary = c("A", "C", "G", "T"))

z <- gen()
x <- z[[1]]
y <- z[[2]]
sw <- z[[3]]
pred <- model$predict(x) 
pred <- apply(pred[1,,], 1, which.max) - 1 
df <- data.frame(x = x[1, ], sw = sw[1, ], y = y[1, ], pred = pred)
head(df)
df[df$sw == 1, ]
df2 <- df[df$sw == 1, ]
table(df2$pred, df2$y)
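
From the entries where masking was applied, we can also compute how often the model recovers the original token (a quick sketch based on the data frame from above):

mean(df2$pred == df2$y) # accuracy on masked positions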

Label classification

By label classification, we mean the task of mapping a label to a sequence. For example: given the input ACGACCG, does the sequence belong to a viral or a bacterial genome?

deepG offers three options to map a label to a sequence:

  1. read the label from the fasta header

  2. put the files of each class into separate folders

  3. read the label from a csv file

Create dummy data

To test label classification, we create a simple dummy data set. One class consists of random sequences using only A and C, and the second class uses only G and T.

# create training fasta files
train_dir_1 <- tempfile()
train_dir_2 <- tempfile()
dir.create(train_dir_1)
dir.create(train_dir_2)
train_dir <- list(train_dir_1, train_dir_2)

for (i in 1:2) {
  
  if (i == 1) {
    vocabulary <- c("A", "C")
    header <- "label_1"
    fasta_name_start <- "label_1_train_file"
  } else {
    vocabulary <- c("G", "T")
    header <- "label_2"
    fasta_name_start <- "label_2_train_file"
  }
  
  create_dummy_data(file_path = train_dir[[i]],
                    num_files = 3,
                    seq_length = 20, 
                    num_seq = 5,
                    header = header,
                    fasta_name_start = fasta_name_start,
                    vocabulary = vocabulary)
}  

# create validation fasta files
val_dir_1 <- tempfile()
val_dir_2 <- tempfile()
dir.create(val_dir_1)
dir.create(val_dir_2)
val_dir <- list(val_dir_1, val_dir_2)

for (i in 1:2) {
  
  if (i == 1) {
    vocabulary <- c("A", "C")
    header <- "label_1"
    fasta_name_start <- "label_1_val_file"
  } else {
    vocabulary <- c("G", "T")
    header <- "label_2"
    fasta_name_start <- "label_2_val_file"
  }
  
  create_dummy_data(file_path = val_dir[[i]],
                    num_files = 3,
                    seq_length = 20, 
                    num_seq = 5,
                    header = header,
                    fasta_name_start = fasta_name_start,
                    vocabulary = vocabulary)
}  

Label by folder

In this approach, we put all data from one class into a separate folder. Say we want to classify whether a sequence belongs to a viral or a bacterial genome. We may then put all virus files into one folder and all bacteria files into another. In this case the path and path_val arguments should be vectors, where each entry is the path to one class.

First we have to create a model. We may use a model with 1 LSTM and 1 dense layer for predictions. An input sequence has length 5.

model <- create_model_lstm_cnn(
  maxlen = 5,
  layer_lstm = c(8),
  learning_rate = 0.1,
  layer_dense = c(2), # binary classification
  vocabulary_size = 4 # text consists of A,C,G,T
)
train_model(train_type = "label_folder", # reading label from folder  
            model = model,
            path = c(train_dir_1, # note that path has two entries 
                     train_dir_2),
            path_val = c(val_dir_1,
                         val_dir_2),
            steps_per_epoch = 5, # use 5 batches per epoch
            train_val_ratio = 0.2, 
            batch_size = 8,
            epochs = 2,
            vocabulary_label = c("label_1", "label_2") # names of classes
)
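
As a quick plausibility check, we can again one-hot encode a short sequence by hand (assuming, as in the language model example above, the default encoding with columns ordered A, C, G, T) and see which class the model assigns:

vocabulary <- c("A", "C", "G", "T")
test_seq <- strsplit("ACACA", "")[[1]] # uses only A and C, so it should resemble "label_1"
x <- array(0, dim = c(1, 5, 4))
for (i in seq_along(test_seq)) {
  x[1, i, match(test_seq[i], vocabulary)] <- 1
}
pred <- model$predict(x)
c("label_1", "label_2")[which.max(pred[1, ])] # predicted class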

Label by fasta header

The fasta headers in our dummy data are named “label_1” or “label_2”:

files <- list.files(train_dir_1, full.names = TRUE)
fasta_file <- microseq::readFasta(files[1])
head(fasta_file)
train_model(train_type = "label_header", # reading label from fasta header  
            model = model,
            path = train_dir,
            path_val = val_dir,
            steps_per_epoch = 5, 
            train_val_ratio = 0.2, 
            batch_size = 8,
            epochs = 2,
            vocabulary_label = c("label_1", "label_2") # names of labels
)

Label from csv file

In this approach we extract the sequence label by mapping the current file name to a csv table.

files_1 <- basename(list.files(c(train_dir_1, val_dir_1)))
files_2 <- basename(list.files(c(train_dir_2, val_dir_2)))
file <- c(files_1, files_2)
label_1 <- stringr::str_detect(file, "label_1") %>% as.integer()
label_2 <- stringr::str_detect(file, "label_2") %>% as.integer()
df <- data.frame(file, label_1, label_2)
df

csv_path <- tempfile(fileext = ".csv")
write.csv(df, csv_path, row.names = FALSE)

hist <- train_model(train_type = "label_csv",
                    target_from_csv = csv_path,
                    model = model,
                    path = train_dir,
                    path_val = val_dir,
                    steps_per_epoch = 5,
                    train_val_ratio = 0.2,
                    batch_size = 8,
                    epochs = 2)

plot(hist)

Training with rds files

We can also use rds files as input; in this case the data must already be preprocessed. We may use the dataset_from_gen function to create rds files from fasta files.

rds_folder_train <- tempfile()
rds_folder_val <- tempfile()
dir.create(rds_folder_train)
dir.create(rds_folder_val)

for (data_type in c("train", "val")) {
  
  if (data_type == "train") {
    output_path <- rds_folder_train
    path_corpus <- train_dir
  } else {
    output_path <- rds_folder_val
    path_corpus <- val_dir
  }
  
  dataset_from_gen(output_path = output_path,
                   iterations = 25, # create 25 rds files 
                   train_type = "label_folder",
                   path_corpus = path_corpus, 
                   batch_size = 128,
                   maxlen = 5,
                   step = 5,
                   vocabulary = c("a", "c", "g", "t"),
                   file_name_start = "batch_")
  
}

We created 25 rds files of preprocessed data each for training and for validation.

train_files <- list.files(rds_folder_train, full.names = TRUE)
basename(train_files)

example_batch <- readRDS(train_files[1])
x <- example_batch[[1]]
y <- example_batch[[2]]
dim(x)
dim(y)
x[1,,]
y[1,]
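
Assuming the default one-hot encoding used above (columns in the order a, c, g, t), we can map the first sample back to its characters:

vocabulary <- c("a", "c", "g", "t")
paste(vocabulary[apply(x[1, , ], 1, which.max)], collapse = "") # decoded input sequence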

We can now use these files for training.

model <- create_model_lstm_cnn(
  maxlen = 5,
  layer_lstm = c(8),
  learning_rate = 0.1,
  layer_dense = c(2))

hist <- train_model(train_type = "label_rds",
                    model = model,
                    path = rds_folder_train,
                    path_val = rds_folder_val,
                    steps_per_epoch = 5,
                    format = "rds",
                    batch_size = 8,
                    epochs = 2)

plot(hist)