Hello,
I have been stuck on a strange problem for two days. I am working on a retrieval-based chatbot, and to this end I am training a binary classification model whose input is a pair of sentences `(context, response)` and whose output is a label: `1` if `response` is the correct response to `context`, and `0` otherwise.
The model I’m using is a dual encoder, defined as follows.
class Encoder(nn.Module):
    """LSTM sentence encoder with GloVe-initialised, trainable embeddings.

    Args:
        glove: Object exposing a ``word2vec`` mapping from a word to its
            embedding vector (only ``glove.word2vec.get`` is used here).
        emb_size: Dimension of the word embeddings.
        hidden_size: Dimension of the LSTM hidden state.
        word2id: Dictionary mapping each word to its row index in the
            embedding matrix.
        p_dropout: Dropout probability applied to the final hidden state.
    """

    def __init__(self, glove, emb_size, hidden_size, word2id, p_dropout):
        super(Encoder, self).__init__()
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.vocab_size = len(word2id)
        self.p_dropout = p_dropout
        self.word2id = word2id  # dictionary mapping word to id
        self.glove = glove  # glove class of embeddings

        self.embedding = nn.Embedding(self.vocab_size, self.emb_size)
        self.lstm = nn.LSTM(self.emb_size, self.hidden_size, batch_first=True)
        self.dropout_layer = nn.Dropout(self.p_dropout)

        self.init_weights()

    def init_weights(self):
        """Initialise the LSTM weight matrices and load the GloVe vectors."""
        # LSTM parameters already require gradients by default, so there is
        # no need to set ``requires_grad`` on them explicitly.
        init.uniform_(self.lstm.weight_ih_l0, a=-0.01, b=0.01)
        init.orthogonal_(self.lstm.weight_hh_l0)

        # Start from zeros rather than uninitialised memory: any row that is
        # not overwritten below (word missing from GloVe, or an index that no
        # word maps to) is then a well-defined zero vector instead of
        # arbitrary values that could be NaN/inf.
        embedding_weights = torch.zeros(self.vocab_size, self.emb_size)
        for word, index in self.word2id.items():
            vector = self.glove.word2vec.get(word)
            if vector is not None:
                embedding_weights[index] = torch.as_tensor(
                    vector, dtype=torch.float
                )
        self.embedding.weight = nn.Parameter(
            embedding_weights, requires_grad=True
        )

    def forward(self, inputs):
        """Encode a batch of token-id sequences.

        Args:
            inputs: Tensor of token ids; the LSTM is built with
                ``batch_first=True``, so the batch dimension comes first.

        Returns:
            The last layer's final hidden state after dropout,
            dim: (batch_size x hidden_size).
        """
        embeddings = self.embedding(inputs)
        # last_hidden dim: (num_layers * num_directions x batch_size x hidden_size)
        # NOTE(review): if the sequences in a batch are right-padded, this
        # final state is taken after the padding tokens have been fed to the
        # LSTM -- confirm whether packing / length masking is needed.
        _, (last_hidden, _) = self.lstm(embeddings)
        # Keep only the last LSTM layer, dim: (batch_size x hidden_size)
        return self.dropout_layer(last_hidden[-1])
class DualEncoder(nn.Module):
    """Score (context, response) pairs with a learned bilinear form.

    Both inputs go through the same ``encoder``; the score for a pair is
    ``c^T M r``, where ``c`` and ``r`` are the encoded context and response
    and ``M`` is a trainable (hidden_size x hidden_size) matrix.

    Args:
        encoder: Module exposing a ``hidden_size`` attribute and returning
            a (batch_size x hidden_size) tensor when called.
    """

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder
        self.hidden_size = self.encoder.hidden_size

        # Allocate the bilinear matrix and fill it with Xavier-normal values.
        bilinear = torch.empty(self.hidden_size, self.hidden_size)
        init.xavier_normal_(bilinear)
        self.M = nn.Parameter(bilinear, requires_grad=True)

    def forward(self, context_tensor, response_tensor):
        """Return one raw (pre-sigmoid) score per pair, dim: (batch_size x 1)."""
        # Shared encoder for both sides, each dim: (batch_size x hidden_size)
        encoded_context = self.encoder(context_tensor)
        encoded_response = self.encoder(response_tensor)

        # Project the context through M, dim: (batch_size x hidden_size)
        projected = encoded_context.mm(self.M)

        # Batched dot product of each projected context with its response:
        # (batch_size x 1 x hidden_size) @ (batch_size x hidden_size x 1)
        row_vectors = projected.unsqueeze(1)
        column_vectors = encoded_response.unsqueeze(2)
        pair_scores = torch.bmm(row_vectors, column_vectors)

        # Drop the trailing singleton dimension, dim: (batch_size x 1)
        return pair_scores.squeeze(2)
I defined a `DataLoader` with a `WeightedRandomSampler` so that each batch contains roughly the same number of positive and negative samples. The training code is given at the end of the post.
There is something very wrong in the code: the model doesn’t train at all! The loss does not decrease, on either small or large datasets. I have spent a lot of time on this but could not find the mistake.
Thank you very much in advance for your kind help!
# Training script: builds the dual encoder and trains it with a binary
# cross-entropy loss on the raw scores, recording loss/accuracy per epoch.

# NOTE(review): p_dropout=0.85 zeroes most of the 50 hidden units on every
# training step; this is an unusually high rate and a plausible reason for
# the loss not decreasing -- try a much smaller value (e.g. 0.1-0.5).
encoder = Encoder(glove=glove,
                  emb_size=100,
                  hidden_size=50,
                  word2id=word2id,
                  p_dropout=0.85).to(device)
model = DualEncoder(encoder).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 50
train_accuracies = []
train_losses = []
val_accuracies = []
val_losses = []

for epoch in range(epochs):
    start = time.time()

    # Set model to training mode (enables dropout)
    model.train()

    # Per-epoch accumulators
    running_loss = 0.0
    running_corrects = 0
    epoch_pos = 0
    epoch_neg = 0

    # Iterate over data
    for _, context_tensor, response_tensor, label in train_loader:
        batch_size = label.shape[0]
        label_pos = label.sum().item()
        label_neg = batch_size - label_pos
        epoch_pos += label_pos
        epoch_neg += label_neg

        context_tensor = context_tensor.to(device)
        response_tensor = response_tensor.to(device)
        # Reshape to (batch_size x 1) to match the model's score tensor
        label = label.to(device).view(-1, 1)

        # zero the parameter gradients
        optimizer.zero_grad()

        score = model(context_tensor, response_tensor)
        loss = criterion(score, label.float())
        loss.backward()
        optimizer.step()

        # statistics
        # The criterion returns the batch *mean*; weight it by the batch
        # size so that dividing by the sample count below yields the true
        # per-sample average loss.
        running_loss += loss.item() * batch_size
        # Threshold the probabilities on the model's device; no gradient is
        # needed for the accuracy bookkeeping.
        prediction = (torch.sigmoid(score.detach()) >= 0.5).long()
        running_corrects += (prediction == label).sum().item()

    # Compute epoch_loss and epoch_acc
    n_samples = epoch_pos + epoch_neg
    epoch_loss = running_loss / n_samples
    epoch_acc = running_corrects / n_samples
    train_accuracies.append(epoch_acc)
    train_losses.append(epoch_loss)

    print('Epoch {} -- Training Loss: {:.4f} -- Training Accuracy: {:.4f} -- Training time: {:.4f} (m)' .format(
        epoch+1, epoch_loss, epoch_acc, (time.time() - start)/60))