Hey there,
I’m coding my first LSTM and am having issues getting the network to train. The network is meant to perform a binary classification task.
It seems there’s some issue with backpropagating the loss: the gradients I see for the network’s parameters are usually extremely small (< 1e-03). In particular, I’ve noticed that even after calling backward(), output.grad and hidden.grad are both None, which doesn’t seem right.
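For context, this is roughly how I’m checking the gradients after the backward pass (the helper below is just something I use for debugging, not part of the training code shown further down):

# Debugging helper (illustration only): print each parameter's gradient norm after loss.backward()
def print_grad_norms(model):
    for name, param in model.named_parameters():
        if param.grad is None:
            print(name, "grad is None")
        else:
            print(name, param.grad.norm().item())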
Here’s the code for defining the network:
import time

import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, output_size, batch_size):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.embedding_size = embedding_dim
        self.hidden_size = hidden_size
        self.output_size = 1
        self.batch_size = batch_size
        # One linear layer per gate, each acting on the concatenated [hidden, input] vector
        self.linear_f = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.linear_i = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.linear_ctilde = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.linear_o = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, output_size)
        self.init_weights()
        self.length = None  # sequence length, set from the training loop before each forward pass

    def forward(self, x, hidden, c):
        x_emb = x
        # Split the (batch_size, seq_len) input into self.length single-timestep chunks
        embs = torch.chunk(x_emb, self.length, 1)
        outputs = []

        def step(emb, hid, c_t):
            combined = torch.cat((hid, emb), 1)
            f = torch.sigmoid(self.linear_f(combined))           # forget gate
            i = torch.sigmoid(self.linear_i(combined))           # input gate
            c_tilde = torch.tanh(self.linear_ctilde(combined))   # candidate cell state
            c_t = f * c_t + i * c_tilde
            o = torch.sigmoid(self.linear_o(combined))           # output gate
            hid = o * torch.tanh(c_t)
            return hid, c_t

        for t in range(len(embs)):
            hidden, c = step(embs[t], hidden, c)
        decoded = self.decoder(hidden)
        output = torch.softmax(decoded, 1)
        return output, hidden

    def init_hidden(self):
        h0 = torch.zeros(self.batch_size, self.hidden_size, requires_grad=True)
        c0 = torch.zeros(self.batch_size, self.hidden_size, requires_grad=True)
        return h0, c0

    def init_weights(self):
        initrange = .1
        lin_layers = [self.linear_f, self.linear_i, self.linear_ctilde, self.linear_o, self.decoder]
        for layer in lin_layers:
            layer.weight.data.uniform_(-initrange, initrange)
            layer.bias.data.fill_(0)
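To make the shapes concrete, here is roughly how I call the network on a single sequence (the sizes below are made up just for illustration):

# Illustration only: one forward pass with placeholder sizes
model = LSTM(input_size=1, embedding_dim=1, hidden_size=16, output_size=2, batch_size=1)
model.length = 10                              # sequence length of this example
x = torch.randn(1, 10)                         # (batch_size, seq_len), one scalar per timestep
hidden, cell_state = model.init_hidden()       # each of shape (batch_size, hidden_size)
output, hidden = model(x, hidden, cell_state)  # output has shape (batch_size, output_size)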
And here’s the function I’m using to train the network:
def training_loop(batch_size, num_epochs, model, loss_, optim, training_iter, dev_iter,
                  train_eval_iter, verbose, end_early):
    step = 0
    epoch = 0
    total_batches = int(len(training_set) / batch_size)
    epoch_loss = []
    start_time = time.time()
    outputs = []
    ground_truths = []
    last_fi = model.linear_f.weight
    while epoch <= num_epochs:
        model.train()
        x = next(training_iter)
        vectors, conversions = get_batch(x)
        vectors = torch.stack(vectors).view([len(vectors), len(vectors[0])]).float()  # batch_size, seq_len
        conversions = torch.stack(conversions).long().view([batch_size])
        model.length = len(vectors[0])  # tell the model how long this sequence is
        hidden, cell_state = model.init_hidden()
        output, hidden = model(vectors, hidden, cell_state)
        lossy = loss_(output, conversions)
        lossy.backward()
        print(output.grad)  # prints None
        optim.step()
        model.zero_grad()
        if step % total_batches == 0:
            epoch += 1  # count one epoch per full pass over the training data
            if not epoch % 1:
                model.eval()
                print("Epoch %i; Step %i; Loss %f; Train acc: %f; Dev acc %f"
                      % (epoch, step, lossy.item(),
                         evaluate(model, train_eval_iter, lstm),
                         evaluate(model, dev_iter, lstm)))
                print('')
        step += 1
Note: Since the sequences are of varying lengths and I didn’t want to zero-pad them (it seemed like that would alter the meaning of the data), I use a batch size of 1 and pass the length of each sequence to the model by setting its ‘length’ attribute before every forward pass.
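In case it helps, with batch size 1 the forward pass just splits the (1, seq_len) input into seq_len one-element timesteps via torch.chunk, roughly like this (illustration only):

# Illustration only: how a length-4 sequence is split into per-timestep chunks
seq = torch.tensor([[0.1, 0.2, 0.3, 0.4]])  # shape (1, 4): batch_size=1, seq_len=4
steps = torch.chunk(seq, 4, 1)              # tuple of 4 tensors, each of shape (1, 1)
print([s.shape for s in steps])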