I am trying to run some sequences in an LSTM model with 8xV100 GPU’s in an Amazon Sagemaker instance. After just about 30-40 or so batches (batch size 32), I get a CUDA out of memory error. I think there is a memory leak somewhere but I’m new to Pytorch and can’t figure it out.
My Model:
# Class containing the LSTM model initialization and feed-forward logic
class LSTMClassifier(nn.Module):
# LSTM initialization
def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, static_size):
super(LSTMClassifier, self).__init__()
# Setting the hidden layer dimension of the LSTM
self.hidden_dim = hidden_dim
# Initializing the embedding layer
self.embeddings = nn.Embedding(vocab_size, embedding_dim-2)
# Initializing the LSTM layer with one hidden layer
self.lstm = nn.LSTM(((embedding_dim*vocab_size)+static_size), hidden_dim, num_layers = 1, batch_first=True)
# Initializing linear linear that takes the hidden layer output
self.hidden2label = nn.Linear(hidden_dim, label_size)
# Defining the hidden state of the LSTM
def init_hidden(self, batch_size):
# the first is the hidden h
# the second is the cell c
return [autograd.Variable(torch.zeros(batch_size, 1, self.hidden_dim).cuda()),
autograd.Variable(torch.zeros(batch_size, 1, self.hidden_dim).cuda())]
# Defining the feed forward logic of the LSTM. It contains:
# 1. The embedding layer
# 2. The LSTM layer with one hidden layer
# 3. The softmax layer
def forward(self, seq, freq, time, static):
# Grab the mini-batch length and max sequence length (pre-ordered)
# (need to do this in the forward logic because of data parallelism and how the GPU's will split up the batch)
sequence_length = seq.size()[1]
batch_length = seq.size()[0]
# reset the LSTM hidden state.
# Must be done before you run a new batch. Otherwise the LSTM will treat a new batch as a continuation of a sequence
self.hidden = self.init_hidden(batch_length)
# Permute the cell and hidden layers. This is because when using Batch_first = True on data parallel,
# the hidden state will still expect an input of (nLayer, batch size, hidden dim), but we are feeding it (batch size, nLayer, hidden dim)
# Thus, to fix it, we need to swap the first and 2nd inputs before feeding to hidden dim
self.hidden[0] = self.hidden[0].permute(1, 0, 2).contiguous()
self.hidden[1] = self.hidden[1].permute(1, 0, 2).contiguous()
# This is the pass to the embedding layer.
# The sequence is of dimension N and the output is N x Demb
embeds = self.embeddings(seq)
# Concatenate the embedding output with the time and frequency vectors
embeds = torch.cat((embeds,freq), dim=3)
embeds = torch.cat((embeds,time), dim=3)
# Flatten the tensor
x = embeds.view(batch_length, sequence_length, -1)
# Concatenate the static information
x = torch.cat((x, static), dim=2)
# Grab the list of lengths of sequences, for the purpose of packing the padded sequenes
seq_lengths = torch.LongTensor(list(map(len, seq)))
# pack the padded sequence so that paddings are ignored
x = torch.nn.utils.rnn.pack_padded_sequence(x, seq_lengths, batch_first=True)
# Feed to the LSTM layer
self.lstm.flatten_parameters()
lstm_out, self.hidden = self.lstm(x, self.hidden)
# Swap back the 1st and 2nd inputs to the hidden layer back to its original configuration
self.hidden = list(self.hidden)
self.hidden[0] = self.hidden[0].permute(1, 0, 2).contiguous()
self.hidden[1] = self.hidden[1].permute(1, 0, 2).contiguous()
# Unpack the packed padded sequence so that it is ready for prediction
lstm_out, input_sizes = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
# Feed the last layer of the LSTM into the linear layer
y = self.hidden2label(lstm_out[:,-1,:])
# Produce the softmax probabilities
log_probs = F.log_softmax(y)
return log_probs
and my training loop:
#############################
### Set hyper parameters ###
############################
torch.cuda.empty_cache()
EMBEDDING_DIM = 32
HIDDEN_DIM = 50
EPOCH = 10
BATCH_SIZE = 32
best_val_auc = 0.0
model = LSTMClassifier(embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, vocab_size=len(events_to_ix), label_size=len(targets_to_ix), static_size=(len(gender_to_ix)+1))
model = torch.nn.DataParallel(model).cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
no_up = 0
#####################################################
### Set loop to determine number of EPOCHs to run ###
#####################################################
for i in range(EPOCH):
#############################################
### Run the training on the training data ###
#############################################
# Shuffle the data
train_data = train_data.sample(frac=1).reset_index(drop=True)
print('epoch: %d start!' % i)
start = time.time()
# Perform the training on the epoch
model.train()
avg_loss = 0.0
count = 0
truth_res = []
pred_res = []
# Group the dataframe into dataframe chunks of length batch size and loop through each batch
for index, rows in train_data.groupby(np.arange(len(train_data)) // batch_size):
torch.cuda.empty_cache()
# Grab the targets into a list and append it into the truth_res list in order to measure AUC performance
target = [targets_to_ix[target] for target in rows['event_target']]
truth_res.extend(target)
# Encode the data and output to tensors (based on the previous description)
seq, freq, time_data, static = encode_data(rows, events_to_ix)
# Pad the sequences
seq = rnn_utils.pad_sequence(seq, batch_first = True)
freq = rnn_utils.pad_sequence(freq, batch_first = True)
time_data = rnn_utils.pad_sequence(time_data, batch_first = True)
static = rnn_utils.pad_sequence(static, batch_first = True)
# Put the padded sequences into Variable and Cuda cores
seq = autograd.Variable(seq.cuda())
freq = autograd.Variable(freq.cuda())
time_data = autograd.Variable(time_data.cuda())
static = autograd.Variable(static.cuda())
target = autograd.Variable(torch.LongTensor(target).cuda())
# Feed the tensor Variables into the model
pred = model(seq,freq,time_data,static)
# Append the predictions into a list for future AUC evaluation
pred_label = pred.data.max(1)[1].cpu().numpy()
pred_res.extend(pred_label)
# Reset the model gradient
model.zero_grad()
# Compute the loss
loss = loss_function(pred, target)
# Backpropagate
loss.backward()
# Update weights
optimizer.step()
# Computes the average loss
avg_loss += loss.data.item()
# Computes the AUC score
auc_score = get_auroc(truth_res, pred_res)
# Print out progress
count += batch_size
if count % 512 == 0:
print('epoch: %d iterations: %d loss :%g' % (i, count, loss.data.item()))
avg_loss /= len(train_data)
print('epoch: %d done! \n train avg_loss:%g , auc:%g' % (i, avg_loss, auc_score))
print("1 epoch length of time")
print(time.time() - start)
######################################################
### Evaluate the model and test if modele improved ###
######################################################
print('now best val auc:', best_dev_acc)
val_auc = evaluate(model, val_data, loss_function, events_to_ix, targets_to_ix, BATCH_SIZE)
#################################################
### If model is improved, then save the model ###
#################################################
if val_auc > best_val_auc:
best_dev_acc = dev_acc
os.system('rm mr_best_model_auc_*.model')
print('New Best Val AUC!')
torch.save(model.state_dict(), 'best_models/mr_best_model_auc_' + str(int(dev_acc * 100)) + '.model')
no_up = 0
else:
no_up += 1
if no_up >= 10:
exit()
Any help would be much appreciated. I suspect it has something to do with copying many instances of Variables but I dont know enough about it to identify the exact issue