Thank you @vdw. More code below
My apologies, the batches are size 512. The sequences are sentences of variable length k-mers, which are tokenized and then put into an embedding layer. There are 10 possible output classes and the true label for each sentence is to be classified.
Example shape of a batch.sequence is 208,512, example shape of corresponding batch.label is 512.
I fixed the ValueError by reducing num_layers to 1 and making the LSTM non-bidirectional. However, I'm now rather concerned that the classifier is being trained on (and making predictions from) the wrong shape of the sequence tensor. I'm basically not sure whether I should transpose batch.sequence as it enters the RNN. When I use hidden[-1].squeeze(0) instead of hidden.squeeze(0), the multi-layer (>1 layer) bidirectional version runs without errors, but I'm not sure what [-1] is doing to make it work.
Code of the Parameters:
Parameters 2
# Number of stacked LSTM layers. This was 4 when the shape error occurred;
# after dropping to 1 the accuracy fell from 86% to 40% by the 10th epoch.
N_LAYERS = 1
# Passed as `embedding_dim` in the LSTMClassifier usage example.
# NOTE(review): an earlier comment described this as a 300-dimensional
# embedding vector, which disagrees with the value 1500 - confirm.
INPUT_SIZE = 1500
# Passed as `hidden_size` in the LSTMClassifier usage example.
HIDDEN_SIZE = 275
# Number of passes over the training set.
N_EPOCHS = 10
# Presumably the optimizer learning rate - confirm where it is consumed.
LEARNING_RATE = 0.0002
# Whether the LSTM should read the sequence in both directions.
# NOTE(review): the previous comment here was a copy of the N_LAYERS one
# ("before this was 4"); the earlier value was presumably True - confirm.
BIDIRECTIONAL = False
Code of the LSTM class:
Classes
class LSTMClassifier(nn.Module):
    """Sentence classifier: embedding -> (stacked, optionally bidirectional) LSTM -> linear.

    USAGE:
        model = LSTMClassifier(HIDDEN_SIZE, INPUT_SIZE, VOCAB_SIZE,
                               n_lstm_layers=N_LAYERS, n_classes=10,
                               bidirectional=BIDIRECTIONAL)
        model.to(DEVICE)

    The network has (at least) three stages:
        1. embedding  - token ids -> dense vectors,
        2. encoder    - LSTM, repeated ``n_lstm_layers`` times,
        3. predictor  - final hidden state -> one score per class.

    Args:
        hidden_size: number of features in each LSTM hidden state.
        embedding_dim: size of each token embedding (the LSTM input size).
        vocab_size: vocabulary size *excluding* the two special tokens
            (unknown / padding) that the vocabulary builder adds.
        n_lstm_layers: number of stacked LSTM layers.
        n_classes: number of output classes.
        bidirectional: run the LSTM in both directions when True.
    """

    def __init__(self, hidden_size, embedding_dim, vocab_size,
                 n_lstm_layers=1, n_classes=10, bidirectional=False):
        super().__init__()
        # +2 leaves room for the <unk> and <pad> ids; without it the largest
        # id falls outside the table ("index out of range" at lookup time).
        self.embedding = nn.Embedding(vocab_size + 2, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim,
                               hidden_size=hidden_size,
                               num_layers=n_lstm_layers,
                               bidirectional=bidirectional)
        # A bidirectional LSTM yields one final state per direction; the
        # predictor sees both of them concatenated.
        n_directions = 2 if bidirectional else 1
        self.predictor = nn.Linear(hidden_size * n_directions, n_classes)

    def forward(self, seq):
        """Return class scores of shape (batch, n_classes).

        Args:
            seq: LongTensor of token ids shaped (seq_len, batch). nn.LSTM is
                built with the default ``batch_first=False``, so a batch of
                shape (208, 512) is already laid out correctly and must NOT
                be transposed.
        """
        _, (hidden, _) = self.encoder(self.embedding(seq))
        # `hidden` is (n_layers * n_directions, batch, hidden_size), ordered
        # layer by layer. Indexing with [-1] therefore selects the state of
        # the LAST layer, which is why it works for any depth, whereas
        # squeeze(0) only removes the leading axis when that axis has size 1.
        if self.encoder.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            summary = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            summary = hidden[-1]
        return self.predictor(summary)
Code of the training:
def training(epochs, model, optimizer, criterion, train_iterator, valid_iterator):
    """Train `model` and print per-epoch losses and validation accuracy.

    USAGE:
        MODEL = LSTMClassifier(HIDDEN_SIZE, INPUT_SIZE, VOCAB_SIZE)
        OPTIMIZER = optim.Adam(MODEL.parameters(), lr=2e-2)
        CRITERION = nn.CrossEntropyLoss()
        training(N_EPOCHS, MODEL, OPTIMIZER, CRITERION, train_iterator, valid_iterator)

    Args:
        epochs: number of full passes over `train_iterator`.
        model: module mapping `batch.sequence` to scores of shape (batch, n_classes).
        optimizer: optimizer built over `model.parameters()`.
        criterion: loss taking (scores, labels); assumed to return the MEAN
            loss over the batch (the nn.CrossEntropyLoss default).
        train_iterator: iterable of batches exposing `.sequence` and `.label`.
        valid_iterator: same, for the validation set.

    The reported losses are averages per example: every batch loss is
    weighted by its number of labels and the sum is divided by the number of
    examples seen. (Sequences are laid out (seq_len, batch), so the batch
    size is read from the labels, not from `sequence.size(0)`.)
    """
    for epoch in range(1, epochs + 1):
        # ---- training pass -------------------------------------------------
        model.train()  # enable training-mode behaviour (dropout etc.)
        training_loss = 0.0
        n_train = 0
        for batch in train_iterator:
            optimizer.zero_grad()                    # 1. reset gradients
            predict = model(batch.sequence)          # 2. predict class scores
            loss = criterion(predict, batch.label)   # 3. measure the mistakes
            loss.backward()                          # 4. backpropagate
            optimizer.step()                         # 5. update the weights
            batch_size = batch.label.size(0)
            training_loss += loss.item() * batch_size
            n_train += batch_size
        training_loss /= max(n_train, 1)

        # ---- validation pass -----------------------------------------------
        model.eval()  # switch to inference-mode behaviour
        valid_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():  # no gradients are needed for evaluation
            for batch in valid_iterator:
                predict = model(batch.sequence)
                loss = criterion(predict, batch.label)
                batch_size = batch.label.size(0)
                valid_loss += loss.item() * batch_size
                preds = predict.argmax(dim=1)  # most likely class per example
                correct += (preds == batch.label).sum().item()
                total += batch_size
        valid_loss /= max(total, 1)
        accuracy = (correct / total) * 100.0 if total else 0.0

        print("Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}, Accuracy: {:.2f}%".format(
            epoch, training_loss, valid_loss, accuracy))
Code of the train-test-valid Bucket Iterator
Deterministic Results - for testing & reproducibility
# Seed every random number generator in use so that runs are repeatable.
seed = 1
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
# np.random.RandomState(seed) only builds a new generator and throws it away;
# np.random.seed() is what seeds NumPy's global generator.
np.random.seed(seed)

# 1. Getting our data - treating then saving again
# Load the raw CSV, derive an integer class column, and save the processed
# table (plus a small random sample for quick tests).
# Relies on `d_filename` and `DEVICE` being defined earlier in the script.
print("Reading pre-processed raw data…")
d = pd.read_csv("./data/" + d_filename + ".csv", header=None, engine="python")

print("Preview of raw data…")
print(d.head(5))  # show some data

print("Cleaning & processing raw data…")
# Column 0 now holds the pid, so the category source moved from column 1 to
# column 2. (The old `d[1]` assignment was dead code: it was overwritten
# immediately by the line below.)
d["log2foldexpression_cat"] = d[2].astype("category")
# One integer code per category; this is the class the model predicts.
d["output_class"] = d["log2foldexpression_cat"].cat.codes

print("Saving processed data, and generating separate smaller sub-sample for testing…")
device_suffix = DEVICE.replace(":", "_")
d_path = "./data/" + d_filename + "_" + device_suffix + "_processed.csv"
d.to_csv(d_path, header=None, index=None)  # save to new file
# DataFrame.sample raises when asked for more rows than exist, so cap the size.
d.sample(min(1000, len(d))).to_csv("./data/train-processed-sample.csv", header=None, index=None)
Code for generating the error-rate file (this is the part I need the most help with):
# Write one "pid <TAB> predicted class <TAB> actual class" row per validation
# example and compute the overall validation accuracy.
# Relies on `lstm_classifier` and `valid_iterator` defined earlier in the script.
correct = 0
total = 0
with open("error_rate_20201120_pids_included_v2.dat", "w") as fo, torch.no_grad():
    fo.write("Pid\tPredicted\tActual\n")
    lstm_classifier.eval()  # inference-mode behaviour (no dropout etc.)
    for batch in valid_iterator:
        predict = lstm_classifier(batch.sequence)  # (batch, n_classes) scores
        preds = predict.argmax(dim=1)              # most likely class per example
        labels = batch.label
        # `pid` is a RawField, so it arrives as a plain Python list rather
        # than a tensor. NOTE(review): torchtext appears to build every batch
        # attribute from the same example list, so position i of the pids
        # should match position i of the labels - confirm for your version.
        pids = batch.pid
        if len(pids) != len(preds):
            raise ValueError(
                "batch has {} pids but {} predictions".format(len(pids), len(preds)))

        correct += (preds == labels).sum().item()
        total += len(preds)
        fo.writelines(
            "{}\t{}\t{}\n".format(pid, pred, label)
            for pid, pred, label in zip(pids, preds.tolist(), labels.tolist()))

# Guard against an empty validation set.
accuracy = (correct / total) * 100.0 if total else 0.0
2. Defining fields - torchtext allows us to choose input and output data from the dataset
# 2. Defining fields - torchtext lets us choose the input and output columns.
# Relies on `data`, `torchtext`, `d_path`, `VOCAB_SIZE`, `BATCH_SIZE` and
# `DEVICE` being defined earlier in the script.
print("Defining fields for the dataloader…")
LABEL = data.LabelField()  # non-sequential by default: the column is a class id, not text
import revtok
# Only the revtok tokenizer is used; the earlier spaCy-based Field was built
# and then overwritten straight away, so it is not created any more.
INPUT = data.Field(tokenize="revtok", lower=True)
# RawField keeps the pid (e.g. AT1G66370.p) as an untouched string so that it
# can be written next to each prediction in the error-rate file.
PID = data.RawField()
# One entry per CSV column, in column order; None means "ignore this column".
fields = [
    ("pid", PID),
    ("sequence", INPUT),
    ("log2foldexpression", None),
    ("log2foldexpression_cat", None),
    ("label", LABEL),
]
processed_dataset = torchtext.data.TabularDataset(
    path=d_path,  # full data set
    # path="./data/train-processed-sample.csv",  # small sample
    format="CSV",
    fields=fields,
    skip_header=False)

# 3. Split into train, validation and test sets
print("Splitting data into: training set, validation set and test set…")
# Dataset.split returns the parts in the order (train, valid, test), even
# though split_ratio is given as [train, test, valid].
(train, valid, test) = processed_dataset.split(split_ratio=[0.8, 0.1, 0.1])
LABEL.build_vocab(train)  # without this: "'LabelField' object has no attribute 'vocab'"
print(len(train), len(test), len(valid))  # check they look like the right ratios

# TEST 1: no example was lost or duplicated by the split.
if len(train) + len(test) + len(valid) == len(processed_dataset):
    print("TEST 1 PASSED!")

print("Preview of examples of the training set…")
for example_idx in (121, 1010, 1, 1901):
    # Skip indices that do not exist when running on a small sample.
    if example_idx < len(train.examples):
        print(vars(train.examples[example_idx]))

# Longest tokenized sequence in the training set (computed once, then reused).
input_guess = max(len(example.sequence) for example in train.examples)
print("Maximum number of k-mers for a given promoter is: ", input_guess)

# 4. Building a vocabulary - feeds the embedding layer (instead of one-hot encoding)
print("Building vocabulary…")
vocab_size = VOCAB_SIZE  # restricting the vocabulary saves memory
INPUT.build_vocab(train, max_size=vocab_size)
# The vocabulary always holds two extra entries: one for unknown words and
# one padding token used to bring the sequences in a batch to equal length.
# NOTE: eos_token and init_token can also be specified; they are off by default.
print(len(INPUT.vocab))
print("Total number of unique k-mer words is: ", len(INPUT.vocab.freqs))
print("Most frequent k-mer words…")
print(INPUT.vocab.freqs.most_common(10))  # 10 most common words

# 5. Build the batch data loaders
print("Generating batches of data to iterate over training…")
# Built once only. The second, overriding call dropped sort=False; without it
# the validation/test iterators try to sort examples that have no sort_key.
# @TODO: DataParallel to work with 2 x GPUs
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), batch_size=BATCH_SIZE, device=DEVICE, sort=False)