Hi folks,
After reading some tutorials, I wrote a minimal example that aims to do binary classification of an input sequence:
class LSTM_Seq(nn.Module):
    """Classify a whole sequence with a unidirectional LSTM.

    The last timestep's output is projected to ``output_size`` logits and
    returned as log-probabilities (``log_softmax``), so the result pairs
    with ``F.nll_loss``.

    Args:
        input_size: number of features per timestep.
        hidden_size: LSTM hidden-state width.
        n_layers: number of stacked LSTM layers.
        output_size: number of classes.
    """

    def __init__(self, input_size, hidden_size, n_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.lstm = nn.LSTM(input_size, hidden_size,
                            num_layers=n_layers, bidirectional=False)
        self.linear = nn.Linear(hidden_size, output_size)

    def init_hidden(self, n_layers, batch_size, hidden_size, device=None):
        """Return a fresh all-zero (h_0, c_0) pair.

        ``device`` defaults to None (CPU) for backward compatibility;
        ``forward`` passes the input's device so the model also works on GPU.
        (Passing no hidden state to ``nn.LSTM`` would default to zeros on the
        right device anyway; this method is kept for its explicit interface.)
        """
        h0 = torch.zeros(n_layers, batch_size, hidden_size, device=device)
        c0 = torch.zeros(n_layers, batch_size, hidden_size, device=device)
        return (h0, c0)

    def forward(self, x):
        """Run the sequence classifier.

        Args:
            x: tensor of shape (batch, sequence, features).

        Returns:
            Log-probabilities of shape (batch, output_size).
        """
        # nn.LSTM without batch_first expects (sequence, batch, features).
        x = x.permute(1, 0, 2)
        mini_batch = x.size(1)
        # Fresh zero state per forward pass: each call is stateless across
        # batches. Within the sequence the LSTM already carries the hidden
        # state from timestep to timestep internally.
        # Create the zeros on x's device so CPU and GPU both work.
        self.hidden = self.init_hidden(self.n_layers, mini_batch,
                                       self.hidden_size, device=x.device)
        out, self.hidden = self.lstm(x, self.hidden)
        # out[-1]: output of the last timestep, shape (batch, hidden_size).
        out = self.linear(out[-1])
        return F.log_softmax(out, dim=1)
# Build the classifier and its optimizer.
model = LSTM_Seq(input_size=2, hidden_size=100, n_layers=1, output_size=2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# %%
# NOTE(review): the seed is set *after* model construction, so the layer
# weights are not reproducible — only the synthetic data below is.
torch.manual_seed(123)
# Synthetic data: 10000 sequences of length 500 with 2 features each.
# NOTE(review): labels are drawn independently of x_train, i.e. pure noise —
# no model can learn more than the label prior from this dataset.
x_train = torch.rand(10000, 500, 2).float()
y_train = (torch.rand(10000) > 0.4).long()
batch_size = 32
train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)  # create your dataloader
total_train_batch = len(train_dataloader)
# %%
for epoch in range(1):
    running_loss = 0.0
    # Iterate the DataLoader directly instead of iter()/next() over a
    # range — same batches, idiomatic form.
    for k, (seqs, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = model(seqs)          # log-probabilities -> pair with nll_loss
        loss = F.nll_loss(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Print the average over each completed window of 50 mini-batches.
        # (The original printed at k == 0 and divided one batch's loss by 50,
        # which produced the misleading first line "loss: 0.014".)
        if (k + 1) % 50 == 0:
            print('[%d, %3d] loss: %.3f' %
                  (epoch + 1, k + 1, running_loss / 50))
            running_loss = 0.0
However, the model does not seem to be learning; it prints:
[1, 1] loss: 0.014
[1, 51] loss: 0.673
[1, 101] loss: 0.668
[1, 151] loss: 0.665
[1, 201] loss: 0.677
[1, 251] loss: 0.674
[1, 301] loss: 0.680
- I verified that the loss has a gradient function attached, so it should decrease at least to some extent.
- How do the hidden states work here? How can I modify my code so that the hidden state is actually carried over from one timestep to the next?