Unable to overfit a few training samples in a binary classification problem

Hello,

I have been stuck on a strange problem for the past two days. I am working on a retrieval-based chatbot, and to this end I am training a binary classification model: the input is a pair of sentences (context, response), and the output is a label, 1 if response is the correct response to context and 0 otherwise.
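
Each training example is therefore a (context, response) pair of padded word-id tensors plus a 0/1 label. For clarity, a simplified sketch of how the dataset is structured (class and variable names here are illustrative, not my exact code):

import torch
from torch.utils.data import Dataset

class PairDataset(Dataset):
    """Toy dataset yielding (index, context_ids, response_ids, label) tuples."""
    def __init__(self, contexts, responses, labels):
        # contexts / responses: lists of pre-padded LongTensors of word ids
        self.contexts = contexts
        self.responses = responses
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return idx, self.contexts[idx], self.responses[idx], torch.tensor(self.labels[idx])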

The model I’m using is a dual encoder, defined as follows.

class Encoder(nn.Module):
    """ Encoder class """
    def __init__(self, glove, emb_size, hidden_size, word2id, p_dropout): 
        super(Encoder, self).__init__()
        self.emb_size = emb_size
        self.hidden_size = hidden_size 
        self.vocab_size = len(word2id.keys())
        self.p_dropout = p_dropout 
        self.word2id = word2id # dictionary mapping word to id
        self.glove = glove # glove class of embeddings 
        self.embedding = nn.Embedding(self.vocab_size, self.emb_size)
        self.lstm = nn.LSTM(self.emb_size, self.hidden_size, batch_first=True)
        self.dropout_layer = nn.Dropout(self.p_dropout)

        self.init_weights()
             
    def init_weights(self):
        init.uniform_(self.lstm.weight_ih_l0, a=-0.01, b=0.01)
        init.orthogonal_(self.lstm.weight_hh_l0)

        self.lstm.weight_ih_l0.requires_grad = True
        self.lstm.weight_hh_l0.requires_grad = True
        
        # Initialize the embedding weights
        embedding_weights = torch.FloatTensor(self.vocab_size, self.emb_size)
            
        # Use the pretrained GloVe vector when available, otherwise a zero vector
        for word in self.word2id:
            embedding_weights[self.word2id[word]] = torch.FloatTensor(self.glove.word2vec.get(word, np.zeros(self.emb_size)))
        
        self.embedding.weight = nn.Parameter(embedding_weights, requires_grad = True)
            
    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        _, (last_hidden, _) = self.lstm(embeddings) # dim: (num_layers * num_directions x batch_size x hidden_size)
        last_hidden = self.dropout_layer(last_hidden[-1])# last lstm layer, dim: (batch_size x hidden_size)
        return last_hidden


class DualEncoder(nn.Module):
    """ DualEncoder class """
    def __init__(self, encoder):
        super(DualEncoder, self).__init__()
        self.encoder = encoder
        self.hidden_size = self.encoder.hidden_size
        M = torch.FloatTensor(self.hidden_size, self.hidden_size)     
        init.xavier_normal_(M)
        self.M = nn.Parameter(M, requires_grad = True)

    def forward(self, context_tensor, response_tensor):
        context_last_hidden = self.encoder(context_tensor) # dim: (batch_size x hidden_size)
        response_last_hidden = self.encoder(response_tensor) # dim: (batch_size x hidden_size)
        context = context_last_hidden.mm(self.M) # dim: (batch_size x hidden_size)
        context = context.view(-1, 1, self.hidden_size) # dim: (batch_size x 1 x hidden_size)
        response = response_last_hidden.view(-1, self.hidden_size, 1) # dim: (batch_size x hidden_size x 1)
        score = torch.bmm(context, response).view(-1, 1) # dim: (batch_size x 1)

        return score
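
To double-check the shapes, here is a tiny sanity check that can be run on these classes with a fake vocabulary (FakeGlove and fake_word2id are stand-ins for this sketch only; the real model uses GloVe and the full word2id):

import torch

fake_word2id = {w: i for i, w in enumerate(["<pad>", "hello", "world"])}

class FakeGlove:
    word2vec = {}  # every word falls back to the zero-vector default in init_weights

enc = Encoder(glove=FakeGlove(), emb_size=100, hidden_size=50,
              word2id=fake_word2id, p_dropout=0.85)
dual = DualEncoder(enc)

context = torch.randint(0, len(fake_word2id), (4, 12))   # (batch_size, seq_len)
response = torch.randint(0, len(fake_word2id), (4, 12))
print(dual(context, response).shape)  # torch.Size([4, 1]): one logit per pair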

I defined a DataLoader with a WeightedRandomSampler so that each batch contains roughly the same number of positive and negative samples. The training code is given at the end of the post.
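
For reference, the sampler is built in the standard way, weighting each example by the inverse frequency of its class (variable names and the batch size are simplified here; train_labels is assumed to be the list of 0/1 labels aligned with train_dataset):

import torch
from torch.utils.data import DataLoader, WeightedRandomSampler

labels = torch.as_tensor(train_labels)            # 0/1 label for every training example
class_counts = torch.bincount(labels)             # [num_negative, num_positive]
class_weights = 1.0 / class_counts.float()        # inverse class frequency
sample_weights = class_weights[labels]            # one weight per example

sampler = WeightedRandomSampler(weights=sample_weights,
                                num_samples=len(sample_weights),
                                replacement=True)
train_loader = DataLoader(train_dataset, batch_size=32, sampler=sampler)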

There is something very wrong in the code: the model doesn't train at all! The loss does not decrease, whether I use a small or a large dataset. I have spent a lot of time on this but couldn't find the mistake :frowning:

Thank you very much in advance for your kind help!

encoder = Encoder(glove=glove,
                emb_size=100,
                hidden_size=50,
                word2id=word2id,
                p_dropout=0.85).to(device)

model = DualEncoder(encoder).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 50
train_accuracies = []
train_losses = []
val_accuracies = []
val_losses = []

for epoch in range(epochs):
    start = time.time()

    # Set model to training mode
    model.train()

    # Compute running_loss and running_corrects
    running_loss = 0.0
    running_corrects = 0
    epoch_pos = 0
    epoch_neg = 0

    # Iterate over data
    for _, context_tensor, response_tensor, label in train_loader:
        label_pos = label.sum().item()
        label_neg = label.shape[0] - label_pos
        epoch_pos += label_pos
        epoch_neg += label_neg

        context_tensor = context_tensor.to(device)
        response_tensor = response_tensor.to(device)
        label = label.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        with torch.set_grad_enabled(True):
            score = model(context_tensor, response_tensor)
            label = label.view(-1, 1)
            loss = criterion(score, label.float())
            loss.backward()
            optimizer.step()

            # statistics
            running_loss += loss.item() * label.size(0)  # accumulate per-sample loss, not the per-batch mean
            prediction = (torch.sigmoid(score) >= 0.5).type(torch.LongTensor)
            c = (prediction == label.cpu())
            running_corrects += c.sum().item()

    # Compute epoch_loss and epoch_acc
    epoch_loss = running_loss / (epoch_pos + epoch_neg)
    epoch_acc = running_corrects / (epoch_pos + epoch_neg)

    train_accuracies.append(epoch_acc)
    train_losses.append(epoch_loss)

    print('Epoch {} -- Training Loss: {:.4f} -- Training Accuracy: {:.4f} -- Training time: {:.4f} (m)'.format(
            epoch+1, epoch_loss, epoch_acc, (time.time() - start)/60))

I have found that the problem was caused by two things:

  • WeightedRandomSampler.
  • Dropout.

If I use either of them, or both, it becomes very difficult, even impossible, to fit the data. I would really appreciate it if somebody could point me to references that discuss this kind of problem. Thanks.
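
For anyone who hits the same issue: the sanity check that exposed it for me was trying to overfit a handful of examples with dropout turned off and a plain shuffled DataLoader instead of the weighted sampler, roughly like this (the subset size, batch size, and number of epochs are arbitrary choices for the sketch):

from torch.utils.data import DataLoader, Subset

tiny_set = Subset(train_dataset, list(range(16)))             # a handful of examples
tiny_loader = DataLoader(tiny_set, batch_size=16, shuffle=True)

encoder = Encoder(glove=glove, emb_size=100, hidden_size=50,
                  word2id=word2id, p_dropout=0.0).to(device)  # dropout disabled
model = DualEncoder(encoder).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(200):
    for _, context_tensor, response_tensor, label in tiny_loader:
        optimizer.zero_grad()
        score = model(context_tensor.to(device), response_tensor.to(device))
        loss = criterion(score, label.view(-1, 1).float().to(device))
        loss.backward()
        optimizer.step()
    # with this setup the loss should drop close to zero within a few hundred epochs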