Below is a working example;
in this task each word is associated with a label.
import torch
import torch.nn as nn
# =========================
# Define parameters
# =========================
seq_len = 5 # number of steps for each sentence
batch_size = 100 # batch size
output_size = 128 # hidden size
n_layer = 1 # number of network layers
n_word = 1000 # number of words in the vocabulary
n_label = 10 # number of classes each word can be labeled with
# =========================
# Create dummy data
# =========================
# initialize a one-hot representation
x_onehot = np.zeros((batch_size, seq_len, n_word))
# create dummy input data
x = np.random.randint(n_word, size=(batch_size, seq_len))
y = np.random.randint(n_label, size=(batch_size, seq_len))
# fill in the one-hot vector
for i, sample in enumerate(x):
for t, label in enumerate(sample):
x_onehot[i, t, label] = 1
# convert numpy array to torch tensor
# notice the inputs to LSTM should be torch.FloatTensor
x_onehot = torch.from_numpy(x_onehot).type(torch.FloatTensor)
y = torch.from_numpy(y)
print(x_onehot.size(), y.size())
# torch.Size([100, 5, 1000]) torch.Size([100, 5])
# =========================
# Define model
# =========================
class LSTM_classifier(nn.Module):
def __init__(self, batch_size, input_size, output_size, n_label, n_layer):
super(LSTM_classifier, self).__init__()
self.batch_size = batch_size
self.input_size = input_size
self.output_size = output_size
self.n_label = n_label
self.n_layer = n_layer
self.lstm = nn.LSTM(input_size, output_size, n_layer, batch_first=True)
self.linear = nn.Linear(output_size, n_label)
self.softmax = nn.LogSoftmax(dim=1)
def forward(self, input, hidden):
output, hidden = self.lstm(input, hidden)
linear_output = self.linear(output)
return self.softmax(linear_output), hidden
def init_hidden(self, n_sample=None):
if n_sample:
n_seq = n_sample
else:
n_seq = self.batch_size
h_0 = torch.zeros(self.n_layer, n_seq, self.output_size)
c_0 = torch.zeros(self.n_layer, n_seq, self.output_size)
return h_0, c_0
model = LSTM_classifier(batch_size=batch_size,
input_size=n_word,
output_size=output_size,
n_label=n_label,
n_layer=n_layer)
loss_function = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# =========================
# Training
# =========================
for i in range(10):
hidden = model.init_hidden()
log_prob, hidden = model(x_onehot, hidden)
loss = 0
for t in range(log_prob.size()[1]):
loss += loss_function(log_prob[:, t, :], y[:, t])
# calculate loss in this way as a workaround for that loss_function(log_prob, y) is not allowed
model.zero_grad()
loss.backward()
optimizer.step()
print(loss.item())
# =========================
# Inference
# =========================
# inference method one: input a single time-step one-by-one
# examining the output log probability gets all zeros
hidden = model.init_hidden(n_sample=1)
for t in range(seq_len):
output, hidden = model(x_onehot[0, t, :].unsqueeze(0).unsqueeze(0), hidden)
print(output.byte().any())
# inference method two: input a sequence at once
# the output log probability seems okay
hidden = model.init_hidden(n_sample=1)
output, hidden = model(x_onehot[0].unsqueeze(0), hidden)
My question is during inference, why using the inference method one gets all-zero log probability outputs in each time step? I imagine that feeding in the sequence at once should get the same result (outputs and hidden states at each time-step) with feeding in a single word step-by-step, which is however not true.
Thanks in advance for any input.