Cannot adapt a seq-to-binary LSTM classifier into a 10-class classifier: ValueError: Expected input batch_size (2) to match target batch_size (512)

Thank you in advance.

I have been trying to turn a binary classifier into a 10-class sequence classifier, where a tokenized DNA sequence (variable-length k-mers), e.g. “ATG A T A C”, gets a predicted output class from {0, 1, 2, …, 9}.

The issue is that I am working with batches of size 512, but whatever number of classes I set in the predictor layer of the LSTM model (10, 2, or anything else), I get:

Expected input batch_size (2) to match target batch_size (512).

The error is raised in the training loop, even when I call hidden.squeeze() in my forward method:

	def forward( self, seq ):

		try:
			output, (hidden,_) 	= self.encoder(self.embedding(seq))
			# @20201120 - the commented line below is the previous version; the uncommented one is how it is according to the book
			#preds 				= self.predictor(hidden[-1].squeeze(0))  # squeeze removes 1D entries from the shape, see: https://docs.scipy.org/doc/numpy/reference/generated/numpy.squeeze.html
			preds 				= self.predictor(hidden.squeeze(0))  # ^

		except Exception as ex:
			print(ex)
			[max(seq[i]) for i in range(seq.shape[0])]
			pdb.set_trace()

		return preds

The full training function is here:

def training( epochs, model, optimizer, criterion, train_iterator, valid_iterator ):

	"""

	USAGE:
		MODEL 		= LSTMClassifier( HIDDEN_SIZE, INPUT_SIZE, VOCAB_SIZE )
		OPTIMIZER 	= optim.Adam(model.parameters(), lr=2e-2)  	# for e.g. of model, see: "class LSTMClassifier", lr is learning_rate
		CRITERION 	= nn.CrossEntropyLoss() 					# loss calculation choice, for strictly binary try CTRL+F: "BCE on page 83 of PyTorch for Deep Learning - O'Reilly"
		training( N_EPOCHS, MODEL, OPTIMIZER, CRITERION, train_iterator, valid_iterator )

	"""

	for epoch in range(1, epochs+1):

		training_loss 	= 0.0 # restart
		valid_loss 		= 0.0 # ^

		#
		# Training set learning
		#
		model.train() 	# put the model into training mode (an nn.Module method)

		#pdb.set_trace()

		for batch_idx, batch in enumerate(train_iterator):

			#pdb.set_trace()

			#model.zero_grad() 						# 1. reset the gradients (alternative)

			optimizer.zero_grad() 						# 1. reset the gradients

			predict = model( batch.sequence )

			# if len(predict.shape)==2: # if num_layers = 1, then we wrap it in a list else, later we get slicing errors
			# 	#print("Layers is 1")
			# 	predict = [model( batch.sequence )] 			# 2. predict [the output labels]
			# 	#pdb.set_trace()
			# else:
			# 	pass
			
			#pdb.set_trace()

			####################

			## machine test layer 0 

			pdb.set_trace() ## @20201120 - debugging - expected input batch_size (2) to match target batch_size (512) @LATEST

			loss = criterion( predict, batch.label )# 3. loss [check mistaken predictions via criterion]

			# }} // {{ 

			# ## machine test layer 1
			# loss = criterion( predict[-1], batch.label )# 3. loss [check mistaken predictions via criterion]

			####################

			loss.backward() 						# 4. backward [backpropagate to learn from mistakes]
			optimizer.step() 						# 5. optimise [the gradient changes to given learning rate]

			training_loss += loss.data.item() * batch.sequence.size(0) # accumulate losses for summary prints

		training_loss /= len(train_iterator) # normalise by the number of batches

		model.eval() # switch to evaluation mode for the validation pass

		#
		# Validation set learning
		#
		correct = 0
		total 	= 0
		for batch_idx, batch in enumerate(valid_iterator):

			predict = model( batch.sequence ) 				# 1. prediction
			
			# if len(predict.shape)==2: # if num_layers = 1, then we wrap it in a list else, later we get slicing errors
			# 	#print("Layers is 1")
			# 	predict = [model( batch.sequence )] 			# 2. predict [the output labels]
			# 	#pdb.set_trace()
			# else:
			# 	pass


			####################

			## machine layer 0 - when num_layers = 2
			loss = criterion( predict, batch.label ) 	# 2. loss 

			# }} // {{

			# ## machine layer n - when num_layers = 2
			# loss = criterion( predict[-1], batch.label ) 	# 2. loss 
		
			####################

			valid_loss += loss.data.item() * batch.sequence.size(0) 	#  accumulate losses for validation set

			## neater
			# accuracy updates

			####################

			## machine test layer 0 - when num_layers = 2
			preds 	= predict.data.max(1, keepdim=True)[1]  # getting all predicted output classes for entire batch as a vector

			# }} // {{ 

			# ## machine test layer n - when num_layers = 2
			# preds 	= predict[-1].data.max(1, keepdim=True)[1]  # getting all predicted output classes for entire batch as a vector

			####################


			# ## default - when num_layers = 1 
			# preds 	= predict.data.max(1, keepdim=True)[1]  # getting all predicted output classes for entire batch as a vector
			# correct += float(preds.eq(batch.label.data.view_as(preds)).cpu().sum()) # compare predictions to actual validation set class outputs, and sums the correct answers
			
			#######################

			## machine test layer 0  - when num_layers = 2
			# accuracy updates
			correct += float(preds.eq(batch.label.data.view_as(preds)).cpu().sum()) # compare predictions to the actual labels and sum the correct answers

			# }} // {{ 

			# ## machine test layer n  - when num_layers = 2
			# # accuracy updates
			# correct += float(predict[-1].data.max(1, keepdim=True)[1].eq(batch.label.data.view_as(predict[-1].data.max(1, keepdim=True)[1])).cpu().sum())

			#######################			

			total += len(preds)

		valid_loss /= len(valid_iterator)
		accuracy = (correct/total)*100.0

		#
		# Epoch summary
		#
		print("Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}, Accuracy: {:.2f}%".format( 	epoch, 
																					training_loss, 
																					valid_loss,
																					accuracy) )

What do you mean by seq-to-10 classifier? Do you mean a multi-class classifier with 10 classes?

If that’s the case, your model should output a tensor with 10 values per sequence (coming from the output of the last linear layer), so predict should have a shape like [batch_size, num_classes]. Also, your target should be constructed so that there is one target class index for each sequence in the batch, i.e. batch.label should have a shape like [batch_size].
Once you can ensure this, the error should be resolved. To keep things simple, try with batch_size = 1 first.
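
For instance, here is a minimal sketch of the shapes nn.CrossEntropyLoss expects (a standalone example, using your batch size of 512):

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
batch_size, num_classes = 512, 10

predict = torch.randn(batch_size, num_classes)          # [512, 10]: one row of class scores per sequence
target  = torch.randint(0, num_classes, (batch_size,))  # [512]: one class index per sequence
loss = criterion(predict, target)                       # works: both batch dimensions are 512
print(loss)

In your traceback the input's first dimension is 2 rather than 512, which is exactly the mismatch being reported.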


There’s not enough code or detail to help. It’s likely that the shapes of your batches are off, but you don’t show any of this. What are the shapes of the inputs and targets of each batch?
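
For example, something like this (assuming your model and iterators are already built) would print them for one batch:

batch = next(iter(train_iterator))
print(batch.sequence.shape)          # input fed to the model
print(batch.label.shape)             # target passed to the criterion
print(model(batch.sequence).shape)   # what actually reaches the loss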


Thank you @vdw. More code below

My apologies, the batches are of size 512. The sequences are sentences of variable-length k-mers, which are tokenized and then passed through an embedding layer. There are 10 possible output classes, and each sentence has a single true label to be classified.

An example batch.sequence has shape [208, 512]; the corresponding batch.label has shape [512].

I fixed the ValueError by reducing num_layers to 1 and making the LSTM non-bidirectional… However, I am now rather concerned that the classifier is using the wrong dimension of the sequence tensor for training and prediction. I’m basically not sure whether or not I should transpose batch.sequence as it enters the RNN… When I use hidden[-1].squeeze(0) instead of hidden.squeeze(0), I can get the >1-layer + bidirectional version to work without errors, but I’m not sure what the [-1] is doing to make it work.
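
For what it’s worth, here is a minimal standalone reproduction of the shape issue (dummy vocabulary and hidden sizes; the sequence and batch dimensions match mine):

import torch
import torch.nn as nn

emb  = nn.Embedding(100, 32)
lstm = nn.LSTM(input_size=32, hidden_size=16, num_layers=1, bidirectional=True)

seq = torch.randint(0, 100, (208, 512))   # [seq_len, batch_size], as produced by the BucketIterator
output, (hidden, _) = lstm(emb(seq))

print(hidden.shape)               # torch.Size([2, 512, 16]) -> num_layers * num_directions comes first
print(hidden.squeeze(0).shape)    # torch.Size([2, 512, 16]) -> squeeze(0) removes nothing here
print(hidden[-1].shape)           # torch.Size([512, 16])    -> hidden state of the last layer/direction

That leading dimension of 2 is where the 2 in the error message comes from.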

Code of the Parameters:


N_LAYERS = 1 # this was 4 when I got the error above; accuracy has dropped from 86% down to 40% by the 10th epoch
INPUT_SIZE = 1500 # embedding dimension
HIDDEN_SIZE = 275
N_EPOCHS = 10
LEARNING_RATE = 0.0002
BIDIRECTIONAL = False # this was True when I got the error above; hidden[-1].squeeze(0) lets the >1-layer + bidirectional LSTM run without the error, but I don't know what [-1] is doing

Code of the LSTM class:


class LSTMClassifier(nn.Module):

	"""
	USAGE:
		model = LSTMClassifier( HIDDEN_SIZE, INPUT_SIZE, VOCAB_SIZE )
		model.to( DEVICE )
	"""

	# initial setup of the RNN, ..
	# .. given user parameters, notice we have [at least] 3 layers:
	# 		1. embedding,
	#  		2. encoder [x N_LAYERS],
	# 		3. predictor

	def __init__(self, hidden_size, embedding_dim, vocab_size, n_lstm_layers, n_classes, bidirectional):  # bespoke @ANDY:@DEBUG:@1818 this may be causing a bug; the default is the commented signature below
	#def __init__(self, hidden_size, embedding_dim, vocab_size):  # ^
		super(LSTMClassifier, self).__init__()

		#self.embedding 	= nn.Embedding(vocab_size, embedding_dim) # @ANDY:@DEBUG:@1818 this leads to "RuntimeError: index out of range: Tried to access index 20000 out of table with 19999 rows", so a crude fix is the line below
		self.embedding 	= nn.Embedding(vocab_size + 2, embedding_dim) # see ^
		self.encoder 	= nn.LSTM( 	input_size  = embedding_dim,
									hidden_size = hidden_size,
									num_layers  = n_lstm_layers)
									#bidirectional = bidirectional) # @20201120:@latest: we get the input dim 2 error due to bidirectional!
		#self.predictor 	= nn.Linear(hidden_size, N_OUT_CLASSES )  # bespoke @ANDY:@DEBUG:@1818 this may be causing a bug; the line below is the default // arg1 = size of input, arg2 = number of output classes, see: https://pytorch.org/docs/stable/nn.html (CTRL+F: "nn.Linear")
		# @20201120 - the book doesn't explain why it uses arg2 = 2; this is meant to be the size of the output according to: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
		self.predictor 	= nn.Linear(hidden_size, 10 )  # ^
		#self.predictor 	= nn.Linear(hidden_size, n_classes )  # ^

		#self.flatten_parameters()

	# This is how the model makes predictions,
	# .. given an input (during training the output is later used to calculate losses and backprop)
	def forward( self, seq ):

		try:
			output, (hidden,_) 	= self.encoder(self.embedding(seq))
			# @20201120 - the commented line below is the previous version; the uncommented one is how it is according to the book
			#preds 				= self.predictor(hidden[-1].squeeze(0))  # squeeze removes 1D entries from the shape, see: https://docs.scipy.org/doc/numpy/reference/generated/numpy.squeeze.html

			#pdb.set_trace() ## @latest:@now
			preds 				= self.predictor(hidden.squeeze(0))  # ^

		except Exception as ex:
			print(ex)
			[max(seq[i]) for i in range(seq.shape[0])]
			pdb.set_trace()

		return preds

Code of the training:

(This is the same training function as shown in the original post above; the only difference is an extra pdb.set_trace() in the validation loop, added for debugging.)

Code of the train-test-valid BucketIterator:

# Deterministic results - for testing & reproducibility

seed = 1
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
np.random.RandomState(seed)

# 1. Getting our data - processing it and then saving it again

print("Reading pre-processed raw data...")
d = pd.read_csv( "./data/"+d_filename+".csv", header = None, engine = "python")

print("Preview of raw data...")
print(d.head(5)) # show some data

print("Cleaning & processing raw data...")

# @20201120 - have to slice a different column now, since column 0 = pid; the default is the commented-out line
#d["log2foldexpression_cat"] = d[1].astype("category") # create a new column for the output class
d["log2foldexpression_cat"] = d[2].astype("category") # create a new column for the output class
d["output_class"] = d["log2foldexpression_cat"].cat.codes # another new column, as integer class codes

print("Saving processed data, and generating a separate smaller sub-sample for testing...")
device_suffix = DEVICE.replace(":","_")

d_path = "./data/"+d_filename+"_"+device_suffix+"_processed.csv"
d.to_csv( d_path, header = None, index = None ) # save to a new file
d.sample(1000).to_csv( "./data/train-processed-sample.csv", header = None, index = None ) # save a small sample for testing on

Code for generating the error-rate file (this is where I need help the most):

with open("error_rate_20201120_pids_included_v2.dat", "w") as fo: # using a different error .dat file for testing; the default is the commented line below
#with open("error_rate.dat", "w") as fo:

	fo.write("Pid\tPredicted\tActual\n")

	lstm_classifier.eval()

	correct = 0
	total 	= 0

	for batch_idx, batch in enumerate(valid_iterator):

		predict = lstm_classifier( batch.sequence ) 	# 1. prediction
		#predict = model( batch.sequence )

		# if len(predict.shape)==2: # if num_layers = 1, then we wrap it in a list, else we later get slicing errors
		# 	#print("Layers is 1")
		# 	predict = [model( batch.sequence )] 			# 2. predict [the output labels]
		# 	#pdb.set_trace()
		# else:
		# 	pass

		####################

		## machine layer 0 - when num_layers = 2
		#loss = criterion( predict, batch.label ) 	# 2. loss

		# ## machine layer n - when num_layers = 2
		# loss = criterion( predict[-1], batch.label ) 	# 2. loss

		####################

		#valid_loss += loss.data.item() * batch.sequence.size(0) 	# accumulate losses for the validation set

		####################

		## machine test layer 0 - when num_layers = 2
		preds 	= predict.data.max(1, keepdim=True)[1]  # predicted output classes for the entire batch, as a vector

		# ## machine test layer n - when num_layers = 2
		# preds 	= predict[-1].data.max(1, keepdim=True)[1]

		####################

		# accuracy updates
		correct += float(preds.eq(batch.label.data.view_as(preds)).cpu().sum()) # compare predictions to the actual labels and sum the correct answers

		labels = batch.label.data.view_as(preds)

		# @20201120 - @RawField: testing if we can add pids to the error_rate .dat file
		#pids = batch.pid.data.view_as(preds)
		try:
			# @20201120 - @RawField: the commented line below fails, so we just hope that keeping pids as a plain list maps each pid to the correct label/sequence; the default is the commented line
			# pids = batch.pid.view_as(preds)
			pids = batch.pid
		except:
			print("pids")
			pdb.set_trace() # @LATEST

		for i2, pred in enumerate(preds):

			# @20201120:0155hrs the commented line below failed because pids is a list of strings rather than a tensor, so we treat it as a plain list
			fo.write(str(pids[i2])+"\t"+str(pred.data.cpu().numpy()[0])+"\t"+str(labels[i2].data.cpu().numpy()[0])+"\n")
			#fo.write(str(pids[i2].data.cpu().numpy()[0])+"\t"+str(pred.data.cpu().numpy()[0])+"\t"+str(labels[i2].data.cpu().numpy()[0])+"\n")
			#fo.write(str(pred.data.cpu().numpy()[0])+"\t"+str(labels[i2].data.cpu().numpy()[0])+"\n")  # @default original OldPeteV12

		total += len(preds)

	#valid_loss /= len(valid_iterator)
	accuracy = (correct/total)*100.0

# 2. Defining fields - torchtext allows us to choose input and output data from the dataset

print("Defining fields for the dataloader...")

LABEL = data.LabelField() # torchtext.data.LabelField() sets sequential to False by default, since it is a numerical output: 0, 1, 2, etc.

#INPUT = data.Field( tokenize="spacy", lower=True ) # see: Linguistic Features · spaCy Usage Documentation

import revtok

INPUT = data.Field( tokenize="revtok", lower=True ) # this overrides the spacy tokenizer above

# added: ("pid", PID) as a RawField so that e.g. AT1G66370.p can be identified in the error .dat file
PID = data.RawField()

fields = [ ("pid", PID), ("sequence", INPUT), ("log2foldexpression", None), ("log2foldexpression_cat", None), ("label", LABEL) ] # map LABEL and INPUT onto the CSV columns

processed_dataset = torchtext.data.TabularDataset(
	path = d_path, # full 1.6 mil INPUTs
	#path = "./data/train-processed-sample.csv", # 10k sample
	format = "CSV",
	fields = fields,
	skip_header = False)

# 3. Split into train, test and validation sets

print("Splitting data into: training set, validation set and test set...")

(train, test, valid) = processed_dataset.split(split_ratio = [0.8, 0.1, 0.1]) # proportions of each

LABEL.build_vocab(train) # needed to avoid: "AttributeError: 'LabelField' object has no attribute 'vocab'"

#PID.build_vocab(train) # @20201120 we would have to turn the RawField into a Field before build_vocab, since otherwise it is not a tensor, which confuses things downstream

print( len(train), len(test), len(valid) ) # check they look like the right ratios

# TEST 1:
if len(train) + len(test) + len(valid) == len(processed_dataset):
	print("TEST 1 PASSED!")

print("Preview of examples of the training set...")

# interesting examples from: path = "./data/train-processed-sample.csv"; replicate these by uncommenting the line matching CTRL+F: "10k sample"
print(vars(train.examples[121]))  # e.g. TP: predicted sentiment: "happy", clearly it is!
print(vars(train.examples[1010])) # e.g. FP: predicted sentiment: "happy", but is it really happy?
print(vars(train.examples[1]))    # e.g. TN: predicted sentiment: "sad", clearly it is!
print(vars(train.examples[1901])) # e.g. FN: predicted sentiment: "sad", but is it really sad?

print("Maximum number of k-mers for a given promoter is: ", max([len(vars(train.examples[i])['sequence']) for i in range(len(train))]))

input_guess = max([len(vars(train.examples[i])['sequence']) for i in range(len(train))])

# 4. Building a vocabulary - an embedding layer is now used // instead of a one-hot encoder (bad)

print("Building vocabulary...")

vocab_size = VOCAB_SIZE # restrict the vocabulary, which saves memory
INPUT.build_vocab(train, max_size = vocab_size)
#INPUT.build_vocab(train)

print(len(INPUT.vocab)) # check how big our vocabulary is; it always adds two extra tokens:
# ... one for unknown words, and ...
# ... a padding token used to pad the text to roughly the same length.
# NOTE: eos_token and init_token symbols can be specified; they are not set by default.

print("Total number of unique k-mer words is: ", len(INPUT.vocab.freqs.keys()))

print("Most frequent k-mer words...")
print(INPUT.vocab.freqs.most_common(10)) # 10 most common words

# 5. Build the batch DataLoader

print("Generating batches of data to iterate over during training...")

#train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits( (train, valid, test), batch_size = BATCH_SIZE, device = DEVICE, sort=False) # @TODO: DataParallel to work with 2 x GPUs
train_iterator, test_iterator, valid_iterator = data.BucketIterator.splits( (train, test, valid), batch_size = BATCH_SIZE, device = DEVICE) # this overrides the commented line above; @TODO: DataParallel to work with 2 x GPUs
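
To double-check the orientation of the batches coming out of these iterators, I print one batch (torchtext Fields default to batch_first=False, so sequences should already be [seq_len, batch_size], which is the layout nn.LSTM expects by default):

# quick shape check on one batch from the iterators built above
batch = next(iter(train_iterator))
print(batch.sequence.shape)   # e.g. torch.Size([208, 512]) -> [seq_len, batch_size]
print(batch.label.shape)      # e.g. torch.Size([512])      -> one class index per sequence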

Many thanks @Abhilash_Srivastava! I hope this reply answers your question: there is one output class, out of 10 possible classes, per tokenized sentence of variable-length DNA “words”.