ValueError: Expected input batch_size (128) to match target batch_size (50)

I am doing Semantic Role Labeling and am trying to implement an Attention BiLSTM.
Batch size = 128

import torch
import torch.nn as nn
import torch.nn.functional as F

class BaselineModel(nn.Module):
    def __init__(self, hparams):
        super(BaselineModel, self).__init__()
        self.name = hparams.model_name
        self.hidden_dim = hparams.hidden_dim

        # vocab_size = 27308, embedding_dim = 300
        self.word_embedding = nn.Embedding(hparams.vocab_size, hparams.embedding_dim)

        # hidden_dim = 256, bidirectional = True, num_layers = 2, dropout = 0.4 
        self.lstm = nn.LSTM(hparams.embedding_dim, hparams.hidden_dim,
                            bidirectional=hparams.bidirectional,
                            num_layers=hparams.num_layers,
                            dropout=hparams.dropout)

        self.dropout = nn.Dropout(hparams.dropout)

        # hidden_dim = 256, num_classes = 35 
        self.classifier = nn.Linear(hparams.hidden_dim, hparams.num_classes)

    def attnetwork(self, encoder_out, final_hidden):
        # encoder_out: [batch, seq_len, hidden_dim], final_hidden: [1, batch, hidden_dim]
        hidden = final_hidden.squeeze(0)  # [batch, hidden_dim]
        # one attention score per token, softmax over the sequence dimension
        attn_weights = F.softmax(torch.bmm(encoder_out, hidden.unsqueeze(2)).squeeze(2), 1)
        # weighted sum over the sequence -> a single context vector per sequence
        return torch.bmm(encoder_out.transpose(1, 2), attn_weights.unsqueeze(2)).squeeze(2)

    def forward(self, x):
        embeddings = self.word_embedding(x)
        embeddings = self.dropout(embeddings)
        output, (hidden_state, _) = self.lstm(embeddings)

        encoder_out = output[:, :, :self.hidden_dim] + output[:, :, self.hidden_dim:]
        encoder_out = encoder_out.permute(1, 0, 2)
        encoder_hidden = (hidden_state[-2, :, :] + hidden_state[-1, :, :]).unsqueeze(0)
        attention_output = self.attnetwork(encoder_out, encoder_hidden)
        logits = self.classifier(attention_output)
        return logits
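
For context, as far as I understand, attnetwork pools encoder_out over the sequence dimension into a single context vector per sequence, so the classifier only ever sees one vector per sentence. A minimal sketch with dummy tensors (sizes taken from my hparams):

    import torch
    import torch.nn.functional as F

    batch_size, seq_len, hidden_dim = 128, 50, 256
    encoder_out = torch.randn(batch_size, seq_len, hidden_dim)   # [128, 50, 256]
    hidden = torch.randn(batch_size, hidden_dim)                 # [128, 256]

    # one attention score per token, normalized over the sequence dimension
    attn_weights = F.softmax(torch.bmm(encoder_out, hidden.unsqueeze(2)).squeeze(2), 1)
    print(attn_weights.shape)  # torch.Size([128, 50])

    # weighted sum over the 50 tokens -> the sequence dimension is gone
    context = torch.bmm(encoder_out.transpose(1, 2), attn_weights.unsqueeze(2)).squeeze(2)
    print(context.shape)       # torch.Size([128, 256])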

My training procedure produces the following error: "Expected input batch_size (128) to match target batch_size (50)".

    def train(self, train_dataset, valid_dataset, epochs):
        train_loss = 0.0
        for epoch in range(1, epochs + 1):
            epoch_loss = 0.0
            self.model.train()
            for step, sample in enumerate(train_dataset):   # using dataloader
                inputs, labels = sample[0], sample[1]
                # print(inputs.shape)  # [50, 128]
                # print(labels.shape)  # [50, 128]
                self.optimizer.zero_grad()
                predictions_ = self.model(inputs)
                # predictions_.shape = [128, 35], labels.shape =  [50, 128]
                sample_loss = self.loss_function(predictions_, labels) # ERROR OCCURS HERE
                sample_loss.backward()
                clip_grad_norm_(self.model.parameters(), 5.)  # Gradient Clipping
                self.optimizer.step()
                epoch_loss += sample_loss.tolist()
            avg_epoch_loss = epoch_loss / len(train_dataset)
            train_loss += avg_epoch_loss
            valid_loss = self.evaluate(valid_dataset)
            if self._verbose > 0:
                print(f'Epoch {epoch}: [loss = {avg_epoch_loss:0.4f},  val_loss = {valid_loss:0.4f}]')

Working on Colab with torch 1.5.0
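
For reference, the batch_size mismatch message above comes from nn.NLLLoss / nn.CrossEntropyLoss; assuming loss_function is one of those, these are the shapes it accepts (a quick sketch with dummy tensors, not my actual data):

    import torch
    import torch.nn as nn

    criterion = nn.CrossEntropyLoss()

    # one label per sequence: logits [batch, num_classes], targets [batch]
    logits = torch.randn(128, 35)
    targets = torch.randint(0, 35, (128,))
    print(criterion(logits, targets).item())

    # one label per token: logits [batch, num_classes, seq_len], targets [batch, seq_len]
    token_logits = torch.randn(128, 35, 50)
    token_targets = torch.randint(0, 35, (128, 50))
    print(criterion(token_logits, token_targets).item())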

Could you post the shape of embeddings before feeding it to self.lstm?
It seems that you are neither permuting the input to this layer nor setting batch_first=True, which might yield a wrong output and this error.

@ptrblck I removed batch_first=True and here are all the shapes:

    def forward(self, x):
        embeddings = self.word_embedding(x)
        embeddings = self.dropout(embeddings)
        # print(f"Embeddings Shape: {embeddings.shape}")  # torch.Size([50, 128, 300])
        output, (hidden_state, _) = self.lstm(embeddings)
        # print(f"Output Shape: {embeddings.shape}")  # torch.Size([50, 128, 300])
        # print(f"Hidden_state Shape: {hidden_state.shape}")  torch.Size([4, 128, 256])
        encoder_out = output[:, :, :self.hidden_dim] + output[:, :, self.hidden_dim:]
        # print(f"encoder_out Shape: {encoder_out.shape}")  # torch.Size([50, 128, 256])
        encoder_out = encoder_out.permute(1, 0, 2)
        # print(f"encoder_out Shape after premute: {encoder_out.shape}")  # torch.Size([128, 50, 256])
        encoder_hidden = (hidden_state[-2, :, :] + hidden_state[-1, :, :]).unsqueeze(0)
        # print(f"encoder hidden Shape: {encoder_hidden.shape}")  # torch.Size([1, 128, 256])
        attention_output = self.attnetwork(encoder_out, encoder_hidden)
        # print(f"Attention Output Shape: {attention_output.shape}")  # torch.Size([128, 256])
        logits = self.classifier(attention_output)
        # print(f"Logits Shape: {logits.shape}")  # torch.Size([128, 35])
        return logits

I guess the final output should be 3D, torch.Size([128, 50, 35]), as for every one of the 50 tokens I have to get a probability distribution over which label it belongs to. Please correct me if I am wrong.
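
Something like this sketch (dummy tensors, sizes from my hparams) is what I have in mind: applying the linear classifier to every time step so each token gets its own logits, then moving the class dimension second for the loss:

    import torch
    import torch.nn as nn

    batch_size, seq_len, hidden_dim, num_classes = 128, 50, 256, 35
    encoder_out = torch.randn(batch_size, seq_len, hidden_dim)   # per-token encoder states

    classifier = nn.Linear(hidden_dim, num_classes)
    token_logits = classifier(encoder_out)        # nn.Linear acts on the last dimension
    print(token_logits.shape)                     # torch.Size([128, 50, 35])

    # nn.CrossEntropyLoss expects the class dimension second for per-token targets
    labels = torch.randint(0, num_classes, (batch_size, seq_len))
    loss = nn.CrossEntropyLoss()(token_logits.permute(0, 2, 1), labels)
    print(loss.item())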

The shape of hidden_state, [num_layers*num_directions=4, batch_size=128, hidden_size=256] (and of output), already seems to be wrong, as the batch size changed from 50 to 128.
Note that in the default setup the input to nn.LSTM is expected to have the shape [seq_len, batch_size, features], so you should either permute embeddings or use batch_first=True.
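
For illustration, a minimal sketch of the two layouts with dummy tensors (sizes taken from the hparams above; dropout omitted since it does not affect shapes):

    import torch
    import torch.nn as nn

    # default layout: input is [seq_len, batch_size, features]
    lstm = nn.LSTM(300, 256, num_layers=2, bidirectional=True)
    out, (h, _) = lstm(torch.randn(50, 128, 300))
    print(out.shape)  # torch.Size([50, 128, 512]) -> [seq_len, batch, num_directions*hidden]
    print(h.shape)    # torch.Size([4, 128, 256])  -> [num_layers*num_directions, batch, hidden]

    # batch_first=True: input is [batch_size, seq_len, features]
    lstm_bf = nn.LSTM(300, 256, num_layers=2, bidirectional=True, batch_first=True)
    out_bf, (h_bf, _) = lstm_bf(torch.randn(128, 50, 300))
    print(out_bf.shape)  # torch.Size([128, 50, 512])
    print(h_bf.shape)    # torch.Size([4, 128, 256]) -> hidden state layout is unaffected by batch_first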