Problem with dimension sizes in LSTM

I get this error: size mismatch, m1: [50 x 52480], m2: [256 x 2]. I tried to fix it by changing the linear layer dimensions to [52480 x 2]; with that change the model trains, but the loss is always 0.

Here is my code.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class LSTMtagger(nn.Module):

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, num_layers):
        super(LSTMtagger, self).__init__()

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout = 0.5)

        self.dropout = nn.Dropout(p=0.3)
        # The linear layer that maps from hidden state space to tag space
        self.fc = nn.Linear(hidden_dim, output_size)
        
        self.sigmoid = nn.Sigmoid()

    def forward(self, sentence):
        print(sentence.size())
        embeds = self.word_embeddings(sentence)
        print(embeds.size())
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 205, -1))
        print(lstm_out.size())
        d_out = self.dropout(lstm_out)

#         d_out = d_out.view(50*256,205,1)
#         print(d_out.size())
        tag_space = self.fc(d_out.view(len(sentence), -1))
        tag_scores = F.sigmoid(tag_space)
        print(tag_scores)
        return tag_scores
vocab_size = len(vocab_to_int) + 1 # +1 for the 0 padding
output_size = 2
embedding_dim = 205
hidden_dim = 256
num_layers = 2

model = LSTMtagger(vocab_size, output_size, embedding_dim, hidden_dim, num_layers)
print(model)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

Your current code throws an error in the self.lstm layer if I pass an input of shape [50, 256] to it.
With batch_first=True, the input is expected to have the shape [batch_size, seq_len, features], so just passing embeds without the view gets rid of this error.
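
For example, here is a minimal shape check (the vocab size of 1000 is just a placeholder; 50 and 256 are the batch size and padded sequence length implied by your input, 205 and 256 are your embedding_dim and hidden_dim):

import torch
import torch.nn as nn

embedding = nn.Embedding(1000, 205)   # 1000 is a placeholder vocab size
lstm = nn.LSTM(205, 256, num_layers=2, batch_first=True, dropout=0.5)

sentence = torch.randint(0, 1000, (50, 256))  # [batch_size, seq_len]
embeds = embedding(sentence)                  # [50, 256, 205] = [batch, seq, feature]
lstm_out, _ = lstm(embeds)                    # [50, 256, 256], no .view() needed
print(lstm_out.shape)                         # torch.Size([50, 256, 256])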

If you would like to use the output features of all time steps, setting in_features=256*256 should work, as you already tried.
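
A rough sketch of that variant, continuing from the lstm_out above (the 256 * 256 assumes seq_len=256 and hidden_dim=256, so adjust it if your padded sequence length differs):

fc = nn.Linear(256 * 256, 2)                              # seq_len * hidden_dim -> output_size
flat = lstm_out.contiguous().view(lstm_out.size(0), -1)   # [50, 65536]
tag_space = fc(flat)                                      # [50, 2]

If you only need one prediction per sentence, another common option is to feed just the last time step (lstm_out[:, -1, :]) into a Linear(hidden_dim, output_size), which avoids tying in_features to the padded sequence length.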

That being said, if you are using nn.NLLLoss, you should apply F.log_softmax to your output instead of F.sigmoid.
Could you change that and try to train your model again?
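
For example, a minimal sketch continuing from the tag_space above (the target tensor is made up just to show the expected shapes):

import torch.nn.functional as F

tag_scores = F.log_softmax(tag_space, dim=1)   # [50, 2] log-probabilities
targets = torch.randint(0, 2, (50,))           # class indices, not one-hot
loss = nn.NLLLoss()(tag_scores, targets)
loss.backward()

Note that nn.CrossEntropyLoss combines log_softmax and NLLLoss in one step, so you could alternatively return the raw tag_space and switch the criterion.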