Attention BiLSTM CrossEntropy loss function shape mismatch

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm


class Atten_BiLSTM(nn.Module):
    """
    Code source:
    https://github.com/littleflow3r/attention-bilstm-for-relation-classification/blob/master/model.py
    """
    def __init__(self, hyperparameters):
        super(Atten_BiLSTM, self).__init__()
        self.hidden_dim = hyperparameters.hidden_dim
        self.batch_size = hyperparameters.batch_size
        self.emb_dim = hyperparameters.embedding_dim
        self.gpu = 'cuda' if torch.cuda.is_available() else 'cpu'

        # vocab_size 456, embedding_dim 300
        self.embedding = nn.Embedding(hyperparameters.vocab_size, hyperparameters.embedding_dim) 
        # embedding_dim 300, hidden_dim 256, bidirectional True
        self.encoder = nn.LSTM(hyperparameters.embedding_dim,
                               hyperparameters.hidden_dim,
                               bidirectional=hyperparameters.bidirectional)
        # hidden dim 256, num_classes 4
        self.fc = nn.Linear(hyperparameters.hidden_dim, hyperparameters.num_classes) 
        # dropout 0.2
        self.dropout = nn.Dropout(hyperparameters.dropout) 
    
    def attnetwork(self, encoder_out, final_hidden):
        hidden = final_hidden.squeeze(0)
        attn_weights = torch.bmm(encoder_out, hidden.unsqueeze(2)).squeeze(2)
        soft_attn_weights = F.softmax(attn_weights, 1)
        new_hidden = torch.bmm(encoder_out.transpose(1,2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return new_hidden
    
    def forward(self, sequence):
        emb_input = self.embedding(sequence)    
        inputx = self.dropout(emb_input)
        output, (hn, cn) = self.encoder(inputx)
        fbout = output[:, :, :self.hidden_dim] + output[:, :, self.hidden_dim:]
        fbout = fbout.permute(1,0,2).to('cuda')
        fbhn = (hn[-2,:,:] + hn[-1,:,:]).unsqueeze(0)
        attn_out = self.attnetwork(fbout, fbhn)
        logits = self.fc(attn_out)
        return logits
    
    def train_(self, optimizer, loss_fn, train_dataset, epochs=2):
        train_loss = 0.
        for _ in tqdm(range(epochs)):
            self.train()
            epoch_loss = 0.0
            for _, samples in enumerate(train_dataset):
                inputs, labels = samples['inputs'], samples['outputs']
                optimizer.zero_grad()
                # inputs shape [128, 256] = [batch size, max seq len]
                predictions = self(inputs)
                # print(predictions.shape)  # torch.Size([256, 4])
                # print(labels.shape)  # torch.Size([128, 256])
                loss = loss_fn(predictions, labels)
                loss.backward()
                optimizer.step()
            train_loss += loss.tolist()
        return train_loss

atten_model = Atten_BiLSTM(hyperparams).to('cuda')

train_dataset_ = DataLoader(train_dataset, batch_size=hyperparams.batch_size, shuffle=True)

lossf = nn.CrossEntropyLoss(ignore_index=train_dataset.label2idx['<PAD>'])
opt = optim.Adam(atten_model.parameters(), lr=1e-6)
atten_model.train_(optimizer=opt, loss_fn=lossf, train_dataset=train_dataset_, epochs=2)

ERROR (raised at the line loss = loss_fn(predictions, labels)):

ValueError: Expected input batch_size (256) to match target batch_size (128).
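
For context, a minimal illustration of the shape contract nn.CrossEntropyLoss enforces here: for single-label classification it expects logits of shape [batch, num_classes] and targets of shape [batch] holding class indices. The tensors below are made up purely for illustration:

import torch
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()
logits = torch.randn(128, 4)            # [batch size, num classes]
targets = torch.randint(0, 4, (128,))   # [batch size], class indices
print(loss_fn(logits, targets))         # OK: batch sizes match (128 == 128)

# With logits of shape [256, 4] and targets of shape [128, 256], the batch
# dimensions (256 vs 128) disagree, which is exactly the ValueError above.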

Since your target batch size is 128, and your input batch size is also 128 according to one of your comments,

# inputs shape [128, 256] = [batch size, max seq len]

somewhere the dimensions get mixed up. My first guess would be that you define your nn.LSTM with batch_first=False (the default value). If you give the LSTM layer an input of shape [128, 256], it treats 128 as the sequence length and 256 as the batch size. So you either have to define your LSTM with batch_first=True, or add

inputx = inputx.transpose(1,0)

before giving it to the LSTM. Of course, these changes might affect subsequent layers, which you would then also have to adjust (see the sketch below).
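
For reference, here is a minimal, self-contained sketch of both options with dummy tensors, using the shapes from your comments (batch size 128, max seq len 256, embedding dim 300, hidden dim 256); the tensor values and variable names are just illustrative:

import torch
import torch.nn as nn

batch_size, seq_len, emb_dim, hidden_dim = 128, 256, 300, 256
# Stand-in for the output of embedding + dropout: [batch, seq, emb]
inputx = torch.randn(batch_size, seq_len, emb_dim)

# Option 1: tell the LSTM that the batch dimension comes first.
encoder_bf = nn.LSTM(emb_dim, hidden_dim, bidirectional=True, batch_first=True)
output, (hn, cn) = encoder_bf(inputx)
print(output.shape)  # torch.Size([128, 256, 512]) -> [batch, seq, 2 * hidden]

# Option 2: keep batch_first=False (the default) and transpose the input
# to [seq, batch, emb] before handing it to the LSTM.
encoder = nn.LSTM(emb_dim, hidden_dim, bidirectional=True)
output, (hn, cn) = encoder(inputx.transpose(1, 0))
print(output.shape)  # torch.Size([256, 128, 512]) -> [seq, batch, 2 * hidden]

If you go with batch_first=True, the LSTM output is already [batch, seq, features], so the fbout.permute(1, 0, 2) in your forward would no longer be needed; that is the kind of follow-up adjustment to subsequent layers mentioned above.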
