Hi! I'm experimenting with LSTMs for NLP (on a text classification task).
I managed to get my model to train, but it runs out of memory after 4 to 5 epochs (I get "CUDA runtime error (2): out of memory").
From what I understand, I need to free some of my tensors between epochs, but I don't really get how to do that yet.
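From other threads, I gather that "freeing" mostly means not keeping references to Variables that drag their whole autograd graph along, something like this (just my understanding so far, I'm not sure it applies to my case):

# instead of losses.append(loss) in the loop below:
losses.append(loss.data[0])   # store a plain Python float, so the graph can be freed
# and, between epochs, optionally:
torch.cuda.empty_cache()      # return cached, unused GPU blocks to the driver

but I don't really see where (or whether) this belongs in my code.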
Here is my model:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.use_gpu = use_gpu
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()
        self.logsoftmax = nn.LogSoftmax()
        if self.use_gpu:
            self.lstm.cuda()
            self.word_embeddings.cuda()
            self.hidden2label.cuda()

    def init_hidden(self):
        # fresh (h0, c0) pair for the LSTM
        if self.use_gpu:
            h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim).cuda())
            c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim).cuda())
        else:
            h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
            c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        return (h0, c0)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        dim = embeds.size()
        # reshape to (seq_len, batch, embedding_dim), the layout nn.LSTM expects
        x = embeds.view(dim[1], dim[0], dim[-1])
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y = self.hidden2label(lstm_out[-1])
        y = self.logsoftmax(y)
        return y
And here's my training loop:
from IPython.display import clear_output

losses = []
mean_loss = 0

model = LSTMClassifier(embedding_dim=50, hidden_dim=50, vocab_size=35562,
                       label_size=8, batch_size=44, use_gpu=True)
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters())
num_epochs = 25

for epoch in range(num_epochs):
    i = 0
    print('Epoch [{}/{}], Mean Loss {:.4f}, Validation score {:.4f}'
          .format(epoch+1, num_epochs, mean_loss,
                  evaluate_model(model, 'validation', 44)))
    while 'computing the dataset by 44-size minibatches':
        if i > 1539:
            i = 0
            break
        else:
            mini_batch = preprocess.get_data_batch(i, 44, inputs_train)
            text = torch.from_numpy(mini_batch[0]).long().cuda()
            labels = torch.from_numpy(mini_batch[1]).long().cuda()
            # Forward pass
            model.train()
            outputs = model(text)
            loss = criterion(outputs, labels)
            losses.append(loss)
            mean_loss = sum(losses) / len(losses)
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            print('optimized')
            optimizer.step()
            i += 44
As you can see, I'm using retain_graph=True. I believe that's where the memory leak comes from; however, the model won't train past the first iteration without it (I get RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.).
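My current guess is that the graph stays alive because self.hidden is carried over from one mini-batch to the next, so backward() tries to reach back through all the previous batches. Would detaching the hidden state at the start of every mini-batch be the right fix? I sketched something based on the repackage_hidden helper from the official word_language_model example (untested against my setup):

from torch.autograd import Variable

def repackage_hidden(h):
    # re-wrap the underlying data in fresh Variables, so the autograd
    # history stops at the current mini-batch
    if isinstance(h, tuple):
        return tuple(repackage_hidden(v) for v in h)
    return Variable(h.data)

# in the mini-batch loop, before the forward pass:
# model.hidden = repackage_hidden(model.hidden)

If that's right, I suppose I could then drop retain_graph=True entirely.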
I believe there's something simple I'm not doing right, so thanks in advance for your help!
EDIT: I figured out I had forgotten model.hidden = model.init_hidden() in my training loop; however, I still run into the exact same issue.
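In case the placement matters, I put it right before the forward pass of each mini-batch (not sure whether that's the right spot):

model.hidden = model.init_hidden()   # fresh hidden state for this batch
model.train()
outputs = model(text)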