My code runs without errors, but the model does not train regardless of the hyperparameters I use, and I am not sure where it is going wrong. I am facing the same issue with a different model that also uses CTC loss. Am I misunderstanding CTCLoss()?
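For reference, this is how I understand the nn.CTCLoss interface (a minimal standalone sketch following the PyTorch docs; the shapes and values here are made up):

import torch
import torch.nn as nn

T, N, C, S = 50, 4, 35, 10   # input length, batch size, classes (incl. blank), target length
# log-probabilities of shape (T, N, C), normalised over the class dimension
log_probs = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
targets = torch.randint(1, C, (N, S), dtype=torch.long)   # blank index 0 must not appear in targets
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), S, dtype=torch.long)

ctc = nn.CTCLoss(blank=0, reduction='mean')
loss = ctc(log_probs, targets, input_lengths, target_lengths)
loss.backward()

This is the convention I am trying to match in the code below.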
Here is my model:
import torch
import torch.nn as nn

class Bi_RNN(nn.Module):
    def __init__(self, input_dim1, input_dim2, hidden_dim, batch_size, output_dim=35, num_layers=2, rnn_type='LSTM'):
        super(Bi_RNN, self).__init__()
        self.input_dim1 = input_dim1
        self.input_dim2 = input_dim2
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers
        # Define the initial linear hidden layers
        self.init_linear1 = nn.Linear(self.input_dim1, self.input_dim1)
        self.init_linear2 = nn.Linear(self.input_dim2, self.input_dim2)
        # Define the recurrent layers (LSTM by default)
        self.lstm1 = getattr(nn, rnn_type)(self.input_dim1, self.hidden_dim, self.num_layers, batch_first=True, bidirectional=True)
        self.lstm2 = getattr(nn, rnn_type)(self.input_dim2, self.hidden_dim, self.num_layers, batch_first=True, bidirectional=True)
        self.lstm3 = getattr(nn, rnn_type)(self.hidden_dim * 2 * 2, self.hidden_dim, self.num_layers, batch_first=True, bidirectional=True)
        # Define the output layer
        self.linear = nn.Linear(self.hidden_dim * 2, output_dim)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def init_hidden(self):
        # This is what we'll initialise our hidden state as
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))

    def forward(self, input1, input2):
        # Forward pass through the initial linear layers
        linear_input1 = self.init_linear1(input1)
        linear_input2 = self.init_linear2(input2)
        # Forward pass through the LSTM layers
        # shape of lstm_out: [batch_size, seq_len, hidden_dim * 2] (batch_first, bidirectional)
        # self.hidden is a tuple (h_n, c_n); each has shape [num_layers * 2, batch_size, hidden_dim]
        lstm_out1, self.hidden1 = self.lstm1(linear_input1)
        lstm_out2, self.hidden2 = self.lstm2(linear_input2)
        # Can pass on the entirety of lstm_out to the next layer if it is a seq2seq prediction
        lstm_out3 = torch.cat((lstm_out1, lstm_out2), 2)
        lstm_out, self.hidden3 = self.lstm3(lstm_out3)
        y_pred = self.log_softmax(self.linear(lstm_out))
        return y_pred
I am using the Adam optimizer and CTC loss:
import torch.optim as optim

model = Bi_RNN(input_dim1, input_dim2, hidden_dim, num_layers, num_classes).to(device)
criterion = nn.CTCLoss(blank=0, reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
This is my training loop:
from tqdm import tqdm

for e in tqdm(range(1, EPOCHS + 1)):
    # TRAINING
    train_epoch_loss = 0
    train_epoch_acc = 0
    model.train()
    for X1_train_batch, X2_train_batch, y_train_batch, X_train_lens, y_train_lens in train_loader:
        X1_train_batch, X2_train_batch, y_train_batch, X_train_lens, y_train_lens = (
            X1_train_batch.to(device), X2_train_batch.to(device), y_train_batch.to(device),
            X_train_lens.to(device), y_train_lens.to(device))
        y_train_pred = model(X1_train_batch, X2_train_batch)   # (batch_size, seq_len, num_classes)
        y_train_batch = torch.squeeze(y_train_batch)

        T = MAX_SEQ_LEN      # input sequence length
        N = BATCH_SIZE       # batch size
        C = num_classes      # number of classes, including the blank
        pred_len = torch.full((N,), T, dtype=torch.long)

        # CTCLoss expects log-probabilities of shape (T, N, C)
        y_train_pred_trans = y_train_pred.permute(1, 0, 2)
        train_loss = criterion(y_train_pred_trans, y_train_batch, pred_len, y_train_lens)
        train_acc = multi_acc(y_train_pred, y_train_batch, y_train_lens)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()