devtools::install_github("GenomeNet/deepG")
library(deepG)
library(magrittr)
The deepG library offers several options to extract input/target pairs from data. We can differentiate between to main approach:
- Language model: predict a character or several characters in a sequence.
- Label Classification: map a label to a sequence.
Language model
With language model, we mean a model that predicts a character in a sequence. We have several options to determine the output format of the data generator using the output_format
argument.
The output_format
determines the shape of the output for a language model, i.e. part of a sequence is the input and another the target . Assume a sequence abcdefg and maxlen = 6
. Output correspond as follows
“target_right”: abcdef, g
“target_middle_lstm”: (abc, gfe), d (note reversed order of )
“target_middle_cnn”: abcefg, d
“wavenet”: abcdef, bcdefg
Create dummy data
To test the different language model options, we create a simple dummy data set consisting of a repetition of the sequence AAACCCGGGTTTAAACCC….
vocabulary <- c("A", "C", "G", "T")
base_seq <- "AAACCCGGGTTT"
full_seq <- strrep(base_seq, 50)
df <- data.frame(Header = "header", Sequence = full_seq)
# create training fasta file
train_dir <- tempfile()
dir.create(train_dir)
microseq::writeFasta(df, file.path(train_dir, "train_1.fasta"))
# create validation fasta file (use same data as training)
val_dir <- tempfile()
dir.create(val_dir)
microseq::writeFasta(df, file.path(val_dir, "val_1.fasta"))
Predict next character
Say we want to predict the next character in a sequence given the last 5 characters and our text consists of the letters A,C,G,T . First we have to create a model. We may use a model with 1 LSTM and 1 dense layer for predictions.
model <- create_model_lstm_cnn(
maxlen = 5,
layer_lstm = c(8),
layer_dense = c(4),
learning_rate = 0.1,
vocabulary_size = 4 # text consists of A,C,G,T
)
Next we have to specify the location of our training and validation data and the output format of the data generator
hist <- train_model(train_type = "lm", # running a language model
output_format = "target_right", # predict target at end of sequence
model = model,
path = train_dir,
path_val = val_dir,
steps_per_epoch = 5, # use 5 batches per epoch
train_val_ratio = 0.2, # use 20% of samples for validation compared to train
batch_size = 16,
epochs = 4)
plot(hist)
Predict character in middle of sequence
If we want to predict a character in the middle of a sequence and use LSTM layers, we should split our input into two layers. One layer handles the sequence before and one the input after the target. If, for example
sequence: ACCGTGGAA
then first input corresponds to ACCG and second to AAGG. We may create a model with two input layers using the create_model_cnn_lstm_target_middle
model <- create_model_lstm_cnn_target_middle(
maxlen = 5,
layer_lstm = c(8),
layer_dense = c(4),
learning_rate = 0.1,
vocabulary_size = 4
)
The train_model
call is identical to the previous model, except we have to change the output format of the generator by setting output_format = "target_middle_lstm"
. This reverses the order of the sequence after the target.
hist <- train_model(train_type = "lm", # running a language model
output_format = "target_middle_lstm", # predict target in middle of sequence
model = model,
path = train_dir,
path_val = val_dir,
steps_per_epoch = 5, # use 5 batches per epoch
train_val_ratio = 0.2, # use 20% of samples for validation compared to train
batch_size = 16,
epochs = 4)
plot(hist)
Masked language model
Here we mask some parts of the input sequence and the model tries to predict the masked regions. Can be used for training BERT-like models. See also this notebook. We can first check how the generator works.
# create dummy training data
nt_seq <- rep(c("A", "C", "G", "T"), each = 25) %>% paste(collapse = "") %>% strrep(10)
df <- data.frame(Sequence = nt_seq, Header = "seq_1")
fasta_path <- tempfile(fileext = ".fasta")
fasta_file <- microseq::writeFasta(df, fasta_path)
masked_lm <- list(mask_rate = 0.10, # replace 10% of input with special mask token
random_rate = 0.03, # set 3% of input to random value
identity_rate = 0.02, # leave 2% unchanged (and set sample weight to 1)
include_sw = TRUE) # 0,1 matrix showing where masking was applied
gen <- get_generator(path = fasta_path,
train_type = "masked_lm",
masked_lm = masked_lm,
batch_size = 1,
n_gram = 1,
n_gram_stride = 1,
return_int = TRUE,
maxlen = 100,
vocabulary = c("A", "C", "G", "T"))
z <- gen()
x <- z[[1]]
y <- z[[2]]
sw <- z[[3]]
df <- data.frame(x = x[1, ], y = y[1, ], sw = sw[1, ])
print(head(df), 10)
print(df[ df$sw == 1, ])
Create the model architecture.
model <- create_model_transformer(
maxlen = 100,
vocabulary_size = 6,
embed_dim = 16,
ff_dim = 32,
pos_encoding = "embedding",
head_size = 20,
num_heads = 4,
layer_dense = 6,
flatten_method = "none",
last_layer_activation = "softmax",
loss_fn = "sparse_categorical_crossentropy",
solver = "adam",
learning_rate = 0.005
)
Train the model.
batch_size <- 128
masked_lm <- list(mask_rate = 0.10, random_rate = 0.03, identity_rate = 0.02, include_sw = TRUE)
hist <- train_model(model = model,
# training args
run_name = "bert_1",
epochs = 8,
steps_per_epoch = 75,
# generator args
maxlen = 100,
train_type = "masked_lm",
path = fasta_path,
path_val = NULL,
batch_size = batch_size,
step = 25,
masked_lm = masked_lm,
proportion_per_seq = 0.97,
return_int = TRUE)
plot(hist)
Evaluate the trained model.
gen <- get_generator(path = fasta_path,
train_type = "masked_lm",
masked_lm = masked_lm,
batch_size = 1,
n_gram = 1,
n_gram_stride = 1,
return_int = TRUE,
maxlen = 100,
vocabulary = c("A", "C", "G", "T"))
z <- gen()
x <- z[[1]]
y <- z[[2]]
sw <- z[[3]]
pred <- model$predict(x)
pred <- apply(pred[1,,], 1, which.max) - 1
df <- data.frame(x = x[1, ], sw = sw[1, ], y = y[1, ], pred = pred)
head(df)
df[df$sw == 1, ]
df2 <- df[df$sw == 1, ]
table(df2$pred, df2$y)
Label classification
With label classification, we describe the task of mapping a label to a sequence. For example: given the input ACGACCG, does the sequence belong to a viral or bacterial genome?
deepG offers three options to map a label to a sequence
the label gets read from the fasta header
files from every class are in separate folders
get label from csv file
Create dummy data
To test label classification, we create a simple dummy data set. One class consists of random sequences using just A and C and second class uses just G and T.
# create training fasta files
train_dir_1 <- tempfile()
train_dir_2 <- tempfile()
dir.create(train_dir_1)
dir.create(train_dir_2)
train_dir <- list(train_dir_1, train_dir_2)
for (i in 1:2) {
if (i == 1) {
vocabulary <- c("A", "C")
header <- "label_1"
fasta_name_start <- "label_1_train_file"
} else {
vocabulary <- c("G", "T")
header <- "label_2"
fasta_name_start <- "label_2_train_file"
}
create_dummy_data(file_path = train_dir[[i]],
num_files = 3,
seq_length = 20,
num_seq = 5,
header = header,
fasta_name_start = fasta_name_start,
vocabulary = vocabulary)
}
# create validation fasta files
val_dir_1 <- tempfile()
val_dir_2 <- tempfile()
dir.create(val_dir_1)
dir.create(val_dir_2)
val_dir <- list(val_dir_1, val_dir_2)
for (i in 1:2) {
if (i == 1) {
vocabulary <- c("A", "C")
header <- "label_1"
fasta_name_start <- "label_1_val_file"
} else {
vocabulary <- c("G", "T")
header <- "label_2"
fasta_name_start <- "label_2_val_file"
}
create_dummy_data(file_path = val_dir[[i]],
num_files = 3,
seq_length = 20,
num_seq = 5,
header = header,
fasta_name_start = fasta_name_start,
vocabulary = vocabulary)
}
Label by folder
In this approach, we put all data from one class into a separate folder. Say we want to classify if a sequence belongs to a viral or bacterial genome. We may put all virus and bacteria files into their own folder. In this case the path
and path_val
arguments should be vectors, where each entry is the path to one class.
First we have to create a model. We may use a model with 1 LSTM and 1 dense layer for predictions. An input sequence has length 5.
model <- create_model_lstm_cnn(
maxlen = 5,
layer_lstm = c(8),
learning_rate = 0.1,
layer_dense = c(2), # binary classification
vocabulary_size = 4 # text consists of A,C,G,T
)
train_model(train_type = "label_folder", # reading label from folder
model = model,
path = c(train_dir_1, # note that path has two entries
train_dir_2),
path_val = c(val_dir_1,
val_dir_2),
steps_per_epoch = 5, # use 5 batches per epoch
train_val_ratio = 0.2,
batch_size = 8,
epochs = 2,
vocabulary_label = c("label_1", "label_2") # names of classes
)
Label by fasta header
The fasta headers in our dummy data have the names “label_1” or “label_2”
files <- list.files(train_dir_1, full.names = TRUE)
fasta_file <- microseq::readFasta(files[1])
head(fasta_file)
train_model(train_type = "label_header", # reading label from fasta header
model = model,
path = train_dir,
path_val = val_dir,
steps_per_epoch = 5,
train_val_ratio = 0.2,
batch_size = 8,
epochs = 2,
vocabulary_label = c("label_1", "label_2") # names of labels
)
Label from csv file
In this approach we extract the sequence label by mapping the current file name to a csv table.
files_1 <- basename(list.files(c(train_dir_1, val_dir_1)))
files_2 <- basename(list.files(c(train_dir_2, val_dir_2)))
file <- c(files_1, files_2)
label_1 <- stringr::str_detect(file, "label_1") %>% as.integer()
label_2 <- stringr::str_detect(file, "label_2") %>% as.integer()
df <- data.frame(file, label_1, label_2)
df
csv_path <- tempfile(fileext = ".csv")
write.csv(df, csv_path, row.names = FALSE)
hist <- train_model(train_type = "label_csv",
target_from_csv = csv_path,
model = model,
path = train_dir,
path_val = val_dir,
steps_per_epoch = 5,
train_val_ratio = 0.2,
batch_size = 8,
epochs = 2)
plot(hist)
Training with rds files
We can also use rds files files as input, where the data must already be preprocessed. We may use the dataset_from_gen
function to create rds files from fasta files.
rds_folder_train <- tempfile()
rds_folder_val <- tempfile()
dir.create(rds_folder_train)
dir.create(rds_folder_val)
for (data_type in c("train", "val")) {
if (data_type == "train") {
output_path <- rds_folder_train
path_corpus <- train_dir
} else {
output_path <- rds_folder_val
path_corpus <- val_dir
}
dataset_from_gen(output_path = output_path,
iterations = 25, # create 25 rds files
train_type = "label_folder",
path_corpus = path_corpus,
batch_size = 128,
maxlen = 5,
step = 5,
vocabulary = c("a", "c", "g", "t"),
file_name_start = "batch_")
}
We created 25 files for training and validation with preprocessed data.
train_files <- list.files(rds_folder_train, full.names = TRUE)
basename(train_files)
example_batch <- readRDS(train_files[1])
x <- example_batch[[1]]
y <- example_batch[[2]]
dim(x)
dim(y)
x[1,,]
y[1,]
We can now use these files for training.
model <- create_model_lstm_cnn(
maxlen = 5,
layer_lstm = c(8),
learning_rate = 0.1,
layer_dense = c(2))
hist <- train_model(train_type = "label_rds",
model = model,
path = rds_folder_train,
path_val = rds_folder_val,
steps_per_epoch = 5,
format = "rds",
batch_size = 8,
epochs = 2)
plot(hist)