Training LSTM word generator

I am trying to implement a simple character-level name generator. The input word is provided to the model as a one-hot encoded tensor of shape (word_characters, 24), where 24 is the size of the character vocabulary.
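
For reference, each word is one-hot encoded along the character axis, roughly like this (the 24-letter vocabulary string and helper below are only an illustration, not my exact code):

import torch

# hypothetical 24-character vocabulary, only for illustration
vocab = "abcdefghijklmnopqrstuvwx"
char_to_idx = {c: i for i, c in enumerate(vocab)}

def one_hot_word(word):
    # one row per character, one column per vocabulary entry -> (word_characters, 24)
    indices = torch.tensor([char_to_idx[c] for c in word])
    return torch.nn.functional.one_hot(indices, num_classes=len(vocab)).float()

print(one_hot_word("abcdefg").shape)  # torch.Size([7, 24])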

class LSTM(nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        
        self.n_hidden = 32
        self.n_layers = 3
        
        self.lstm = nn.LSTM(
            input_size=chars_count,
            hidden_size=self.n_hidden,
            num_layers=self.n_layers,
            batch_first=True
        )
        self.linear = nn.Linear(in_features=self.n_hidden, out_features=chars_count)  # map each hidden state to character logits
    
    def forward(self, x):
        h0 = torch.zeros((self.n_layers, x.size(0), self.n_hidden))     # x.size(0) is batch size, i.e. 1
        c0 = torch.zeros((self.n_layers, x.size(0), self.n_hidden))
        
        output, state = self.lstm(x, (h0, c0))
        output = self.linear(output)
        
        return output, state

model = LSTM()
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for i in range(1000):
    for xi, yi in dataset:
        # add batch size of 1
        xi = xi[None, :, :]  # shape = [1, 7, 24]
        yi = yi[None, :, :]  # shape = [1, 7, 24]

        optimizer.zero_grad()
        l = loss(model(xi)[0], yi)
        l.backward()
        optimizer.step()

Whenever I execute the training loop, the following error is raised from this line:
l = loss(model(xi)[0], yi)


**ValueError**: Expected target size (1, 24), got torch.Size([1, 7, 24])

Can anyone point out what I am doing wrong? The shape of my model's output is the same as the shape of the training label.
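
A quick shape check right before the loss call (just to illustrate what I mean):

out, _ = model(xi)
print(out.shape)  # torch.Size([1, 7, 24])
print(yi.shape)   # torch.Size([1, 7, 24])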

Hey,

I stripped down your problem to a minimum and ran it:

import torch
from torch import nn
from torch import optim



class LSTM(nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        
        self.n_hidden = 32
        self.n_layers = 3
        
        self.lstm = nn.LSTM(
            input_size=24,
            hidden_size=self.n_hidden,
            num_layers=self.n_layers,
            batch_first=True
        )
        self.linear = nn.Linear(in_features=self.n_hidden, out_features=24)
    
    def forward(self, x):
        h0 = torch.zeros((self.n_layers, x.size(0), self.n_hidden))
        c0 = torch.zeros((self.n_layers, x.size(0), self.n_hidden))
        
        output, state = self.lstm(x, (h0, c0))
        output = self.linear(output)
        
        return output, state

model = LSTM()
loss = nn.CrossEntropyLoss()

t = torch.zeros(1, 7, 24)
t1 = torch.zeros(1, 7, 24)

print(t.size()) # [1, 7, 24]
print(t1.size()) # [1, 7, 24]

output, hidden = model(t)
l = loss(output, t1)

For me this does not throw an error.

You may want to take another look at your tensor sizes.
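
In case it helps: whether nn.CrossEntropyLoss accepts a same-shape (one-hot / probability) target can depend on the PyTorch version. The long-standing convention is integer class indices, with the class dimension second in the input. A rough sketch using your variable names (untested, treat it as an illustration):

output, _ = model(xi)              # (1, 7, 24) = (batch, seq_len, chars_count)
logits = output.permute(0, 2, 1)   # (1, 24, 7): class dimension moved to position 1
target_idx = yi.argmax(dim=-1)     # (1, 7): class indices recovered from the one-hot rows
l = loss(logits, target_idx)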

Greetings,
Patrick