Hello everyone. Recently, I implemented a simple recursive neural network. When training this model on a small sample data set, everything works fine. However, when training it on large data on GPUs, an "out of memory" error is raised. As the training goes on, the usage of GPU memory keeps growing. So, I want to know why this happens. I would be grateful if you could help.
The model and training procedure are defined as follows:
def train_step(self, data):
    """Run one optimization pass over `data` and return the summed loss.

    Each element of `data` is a dict with keys 'p_tree', 'h_tree' (parse
    trees) and 'label' (int class index).
    """
    train_loss = 0
    for _data in data:
        p_tree = _data['p_tree']
        h_tree = _data['h_tree']
        # Build the target once; move to GPU only when requested.
        target = Variable(torch.LongTensor([_data['label']]))
        if args.cuda:
            target = target.cuda()
        self.optimizer.zero_grad()
        # self.model is an instance of class RootAlign
        output = self.model(p_tree, h_tree)
        loss = F.nll_loss(output, target)
        loss.backward()
        self.optimizer.step()
        # loss.data[0] extracts a plain Python number, so accumulating it
        # does NOT retain the autograd graph.
        train_loss += loss.data[0]
        # FIX (memory leak): the model caches a Variable on every tree node
        # (`node.calculate_result`), and each cached Variable retains its
        # entire autograd graph. The trees in `data` persist across
        # iterations/epochs, so GPU memory grows until every tree holds a
        # stale graph. Drop the cached results once the step is done.
        # NOTE(review): assumes postorder_traverse(f) simply calls f(node)
        # on each node — confirm against the tree class.
        for tree in (p_tree, h_tree):
            tree.postorder_traverse(
                lambda node: setattr(node, 'calculate_result', None))
    return train_loss
class RootAlign(nn.Module):
    """Classify a premise/hypothesis tree pair from their root encodings.

    Both trees are encoded bottom-up by a shared VanillaRecursiveNN; the
    two root vectors are concatenated, squashed, and mapped to
    log-probabilities over `config['relation_num']` relations.
    """

    def __init__(self, word_embedding, config):
        super(RootAlign, self).__init__()
        self.rnn = VanillaRecursiveNN(word_embedding,
                                      config['hidden_dim'],
                                      config['cuda_flag'])
        self.linear = nn.Linear(config['hidden_dim'] * 2,
                                config['relation_num'])

    def forward(self, p_tree, h_tree):
        # Encoding leaves the root vector in `calculate_result` on each tree.
        p_tree.postorder_traverse(self.rnn)
        h_tree.postorder_traverse(self.rnn)
        pair = torch.cat((p_tree.calculate_result,
                          h_tree.calculate_result), 1)
        return F.log_softmax(self.linear(F.sigmoid(pair)))
class VanillaRecursiveNN(nn.Module):
    """Recursive NN cell applied to one tree node per call.

    Leaves (node.val is not None) are embedded and projected to the hidden
    space; internal nodes (exactly two children) combine their children's
    hidden vectors. The result is cached on the node as
    `node.calculate_result` so a postorder traversal composes bottom-up.
    """

    def __init__(self, word_embedding, hidden_dim, cuda_flag=False):
        super(VanillaRecursiveNN, self).__init__()
        self.word_dim = word_embedding.embeddings.size(1)
        self.hidden_dim = hidden_dim
        # Embedding table initialized from the pre-trained vectors.
        self.embedding = nn.Embedding(word_embedding.embeddings.size(0),
                                      self.word_dim)
        self.embedding.weight = nn.Parameter(word_embedding.embeddings)
        # Leaf projection is bias-free (third positional arg = False).
        self.word2hidden = nn.Linear(self.word_dim, self.hidden_dim, False)
        self.hidden2hidden = nn.Linear(2 * self.hidden_dim, self.hidden_dim)
        self.cuda_flag = cuda_flag

    def forward(self, node):
        if node.val is not None:
            # Leaf: embed the word id, then project to the hidden space.
            # Build the index tensor once and move it to GPU only if needed,
            # instead of duplicating the whole expression per device.
            idx = torch.LongTensor([node.word_id])
            if self.cuda_flag:
                idx = idx.cuda()
            node.calculate_result = self.word2hidden(
                self.embedding(Variable(idx)))
        else:
            # Internal node: combine exactly two child representations.
            assert len(node.children) == 2
            node.calculate_result = self.hidden2hidden(torch.cat(
                (node.children[0].calculate_result,
                 node.children[1].calculate_result), 1))
        # WARNING: caching a Variable on the node retains its whole autograd
        # graph. If nodes outlive the training step, GPU memory grows without
        # bound — the caller must clear `calculate_result` after backward().
        return node.calculate_result