RuntimeError: shape '[-1, 1280, 1]' is invalid for input of size 2776

Hello Everyone,

I am implementing a stacked BiLSTM on top of the BERT sequence output in the encoder, for text classification on a dialogue state tracking dataset. Afterwards, the output of the stacked BiLSTM and the overall BERT features are concatenated and passed to the decoder for masked attention. I am getting the error RuntimeError: shape '[-1, 1280, 1]' is invalid for input of size 2776. Here is the part of the code where the error occurs:
import torch
import torch.nn as nn

class UtteranceAttention(nn.Module):
    def __init__(self, attn_head, model_output_dim, dropout=0.):
        super(UtteranceAttention, self).__init__()

        self.attn_head = attn_head
        self.model_output_dim = model_output_dim
        self.dropout = dropout
        self.attn_fun = MultiHeadAttention(attn_head, model_output_dim, dropout=0.)

    def forward(self, query, value, attention_mask=None):
        num_query = query.size(0)
        batch_size = value.size(0)
        seq_length = value.size(1)

        # broadcast the slot queries over the batch
        expanded_query = query.unsqueeze(0).expand(batch_size, *query.shape)
        if attention_mask is not None:
            # mask out padded positions in value before attending
            expanded_attention_mask = attention_mask.view(-1, seq_length, 1).expand(value.size()).float()
            new_value = torch.mul(value, expanded_attention_mask)
            attn_mask = attention_mask.unsqueeze(1).expand(batch_size, num_query, seq_length)
        else:
            new_value = value
            attn_mask = None

        attended_embedding = self.attn_fun(expanded_query, new_value, new_value, mask=attn_mask)

        return attended_embedding
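
For reference, this is how I understand the shapes that the mask expansion in UtteranceAttention.forward expects; a small standalone check with made-up sizes (not from my actual data):

import torch

# Illustrative sizes only: value should be a sequence output [batch_size, seq_length, dim]
# and attention_mask a [batch_size, seq_length] mask, so that
# attention_mask.view(-1, seq_length, 1) has exactly batch_size * seq_length elements.
batch_size, seq_length, dim = 2, 5, 16
value = torch.randn(batch_size, seq_length, dim)
attention_mask = torch.ones(batch_size, seq_length)

expanded = attention_mask.view(-1, seq_length, 1).expand(value.size()).float()
print(expanded.size())  # torch.Size([2, 5, 16])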

class Decoder(nn.Module):
    def __init__(self, attn_head, bert_output_dim, dropout_prob):
        super(Decoder, self).__init__()
        # slot utterance attention
        self.slot_utter_attn = UtteranceAttention(attn_head, bert_output_dim, dropout=0.)

        # prediction
        self.pred = nn.Sequential(nn.Dropout(p=dropout_prob),
                                  nn.Linear(bert_output_dim, bert_output_dim),
                                  nn.LayerNorm(bert_output_dim))

    def forward(self, sequence_output, attention_mask, slot_embedding):
        # slot utterance attention
        slot_utter_emb = self.slot_utter_attn(slot_embedding, sequence_output, attention_mask)

        # prediction
        hidden = self.pred(slot_utter_emb)

        return hidden  # [batch_size, num_slots, dim]

class BeliefTracker(nn.Module):
    def __init__(self, pretrained_model_type, lstm_hidden_dim, n_layers, lstm_drop, attn_head,
                 dropout=0.25, num_labels=None, dropout_prob=0.):
        super(BeliefTracker, self).__init__()
        self.encoder = UtteranceEncoding.from_pretrained(pretrained_model_type)
        self.hidden_size = self.encoder.config.hidden_size
        self.rnn = nn.LSTM(self.hidden_size, lstm_hidden_dim, num_layers=n_layers, bidirectional=True,
                           batch_first=True, dropout=lstm_drop)
        #self.pool_dropout = nn.Dropout(self.encoder.config.hidden_dropout_prob)
        #self.classifier = nn.Linear(self.encoder.config.hidden_size + 2 * lstm_hidden_dim, num_labels)
        #self.dropout = nn.Dropout(dropout)
        self.decoder = Decoder(attn_head, self.hidden_size, dropout_prob)

    def forward(self, input_ids, attention_mask, token_type_ids, slot_emb):
        # encoder, a pretrained model, output is a tuple
        sequence_output, pooled_output = self.encoder(input_ids, attention_mask, token_type_ids)[0:2]  # [batch_size, seq_length, dim]
        print(attention_mask.size())
        self.rnn.flatten_parameters()
        _, (hn, cn) = self.rnn(sequence_output)
        # last forward and backward hidden states of the top LSTM layer
        hidden = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=1)
        print(hidden.size())
        #pooled_output = self.pool_dropout(pooled_output)
        pooled_hidden = torch.cat((pooled_output, hidden), dim=1)  # [batch_size, hidden_size + 2 * lstm_hidden_dim]
        print(pooled_hidden.size())

        attention_mask = None

        #logits = self.classifier(pooled_hidden)

        # decoder, slot utterance attention, followed by a linear layer
        slot_output = self.decoder(pooled_hidden, attention_mask, slot_emb)
        return slot_output
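
For reference, here is a rough shape trace of the encoder path in BeliefTracker.forward. The sizes are chosen to match the printed log below, assuming bert-base (hidden size 768) and lstm_hidden_dim = 256; n_layers = 2 is arbitrary here:

import torch
import torch.nn as nn

# Stand-ins for the BERT outputs, with batch_size = 8 and seq_len = 347 as in the log below.
batch_size, seq_len, hidden_size, lstm_hidden_dim = 8, 347, 768, 256
sequence_output = torch.randn(batch_size, seq_len, hidden_size)  # BERT sequence output
pooled_output = torch.randn(batch_size, hidden_size)             # BERT pooled output

rnn = nn.LSTM(hidden_size, lstm_hidden_dim, num_layers=2, bidirectional=True, batch_first=True)
_, (hn, cn) = rnn(sequence_output)
hidden = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=1)          # [8, 512]
pooled_hidden = torch.cat((pooled_output, hidden), dim=1)        # [8, 1280]

print(hidden.size())         # torch.Size([8, 512])
print(pooled_hidden.size())  # torch.Size([8, 1280])
# Note: pooled_hidden is 2-D, i.e. it no longer has a sequence dimension.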

Here is the error:

Epoch: 0% 0/1 [00:00<?, ?it/s]
0% 0/922 [00:00<?, ?it/s]torch.Size([8, 347])
torch.Size([8, 512])
torch.Size([8, 1280])
0% 0/922 [00:12<?, ?it/s]
Epoch: 0% 0/1 [00:12<?, ?it/s]
Traceback (most recent call last):
  File "/content/drive/My Drive/AUX-DST/train-aux.py", line 264, in
    main(args)
  File "/content/drive/My Drive/AUX-DST/train-aux.py", line 150, in main
    slot_emb=slot_lookup) # [batch_size, num_slots, dim]
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/content/drive/My Drive/AUX-DST/models/DST.py", line 155, in forward
    slot_output = self.decoder(pooled_hidden, attention_mask, slot_emb)
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/content/drive/My Drive/AUX-DST/models/DST.py", line 120, in forward
    slot_utter_emb = self.slot_utter_attn(slot_embedding, sequence_output, attention_mask)
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/content/drive/My Drive/AUX-DST/models/DST.py", line 95, in forward
    expanded_attention_mask = attention_mask.view(-1, 1280, 1).expand(value.size()).float()
RuntimeError: shape '[-1, 1280, 1]' is invalid for input of size 2776
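
For what it's worth, as far as I can tell the numbers line up with the printed sizes: attention_mask is [8, 347], i.e. 8 * 347 = 2776 elements, while the value passed into the attention here is pooled_hidden, so seq_length comes out as 1280. A minimal snippet that reproduces the same error:

import torch

# attention_mask has 8 * 347 = 2776 elements, but seq_length resolves to 1280
# because value is the 2-D pooled_hidden tensor instead of a [batch, seq_len, dim] sequence output.
attention_mask = torch.ones(8, 347)
value = torch.randn(8, 1280)   # pooled_hidden: [batch_size, 1280]
seq_length = value.size(1)     # 1280

expanded = attention_mask.view(-1, seq_length, 1).expand(value.size()).float()
# RuntimeError: shape '[-1, 1280, 1]' is invalid for input of size 2776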