Hello, I've been trying to run my model with nn.DataParallel; however, I am running into a problem.
An out-of-memory error comes up whenever I train, even with a batch size of 3 (I use 3 GPUs, so that is one sample per GPU).
I already tried calling self.lstm.flatten_parameters(), but this did not fix the problem.
While looking for other solutions, I found that calculating the loss inside model.forward might solve the problem. However, I do not know how to do that.
Is there a better way of solving this issue?
Or can anyone help me calculate the loss inside model.forward?
The code for the model is below:
class SpeakerDecoder(nn.Module):
    """LSTM decoder that attends over a context sequence and predicts word logits.

    Each step embeds the input words, runs them through a single-layer
    batch-first LSTM, applies ``SoftDotAttention`` over ``ctx`` and projects
    the attended hidden state onto the vocabulary.
    """

    def __init__(self, vocab_size, embedding_size, padding_idx, hidden_size, dropout_ratio):
        """Build the embedding, LSTM, dropout, attention and projection layers.

        Args:
            vocab_size: Number of rows in the embedding table and size of the
                output logit dimension.
            embedding_size: Width of the word embeddings (LSTM input size).
            padding_idx: Index passed to ``nn.Embedding`` as ``padding_idx``.
            hidden_size: LSTM hidden size; also the attention/projection width.
            dropout_ratio: Probability used by the shared ``nn.Dropout``.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.drop = nn.Dropout(dropout_ratio)
        self.attention_layer = SoftDotAttention(hidden_size)
        self.projection = nn.Linear(hidden_size, vocab_size)

    def forward(self, words, ctx, ctx_mask, h0, c0):
        """Decode ``words`` while attending over ``ctx``.

        Args:
            words: Integer tensor of word indices, indexed as (batch, length).
            ctx: Context features, (b, a, r) with r == ``hidden_size``.
            ctx_mask: Attention mask for ``ctx``, (b, a).
            h0: Initial LSTM hidden state; any layout holding
                ``batch * hidden_size`` elements per call (it is reshaped to
                ``(1, batch, hidden_size)``).
            c0: Initial LSTM cell state, same layout as ``h0``.

        Returns:
            Tuple ``(logit, h1, c1)`` where ``logit`` is
            (batch, length, vocab_size) and ``h1``/``c1`` are the final LSTM
            states.
        """
        batch_size, seq_len = words.size(0), words.size(1)

        embeds = self.drop(self.embedding(words))

        # Re-compact the LSTM weights; replicas created by nn.DataParallel
        # would otherwise warn about non-contiguous parameter memory.
        self.lstm.flatten_parameters()

        # Derive the state shape from the actual input instead of the
        # previously hard-coded (1, 3, 512): under nn.DataParallel every
        # replica only sees its own slice of the batch.
        # NOTE(review): for DataParallel to scatter h0/c0 correctly the
        # caller presumably has to pass them batch-first - confirm at the
        # call site.
        state_shape = (1, batch_size, self.hidden_size)
        x, (h1, c1) = self.lstm(embeds, (h0.view(state_shape), c0.view(state_shape)))
        x = self.drop(x)

        # Flatten (batch, length) into one axis for the attention layer.
        batchXlength = batch_size * seq_len
        # Number of decoder positions served by each ctx row; because it is
        # computed from ctx.size(0) this also supports beam search.
        multiplier = batchXlength // ctx.size(0)

        # x        : (b(word), l(word), r) --> (b(word)*l(word), r)
        # ctx      : (b, a, r)             --> (b(word)*l(word), a, r)
        # ctx_mask : (b, a)                --> (b(word)*l(word), a)
        flat_x = x.contiguous().view(batchXlength, self.hidden_size)
        flat_ctx = (
            ctx.unsqueeze(1)
            .expand(-1, multiplier, -1, -1)
            .contiguous()
            .view(batchXlength, -1, self.hidden_size)
        )
        flat_mask = (
            ctx_mask.unsqueeze(1)
            .expand(-1, multiplier, -1)
            .contiguous()
            .view(batchXlength, -1)
        )
        x, _ = self.attention_layer(flat_x, flat_ctx, mask=flat_mask)
        x = x.view(batch_size, seq_len, self.hidden_size)

        # Output the prediction logit
        x = self.drop(x)
        logit = self.projection(x)
        return logit, h1, c1
The code for the training loop is below:
def train(self, n_iters, feedback='teacher'):
''' Train for a given number of iterations.

n_iters: number of iterations of the training loop.
feedback: feedback mode; must be one of self.feedback_options.

NOTE(review): indentation was lost in this paste, so the nesting of the
statements below (in particular whether the random-start section sits
inside the for loop) has to be confirmed against the original file.
'''
# Reject unknown feedback modes before touching any state.
assert feedback in self.feedback_options
self.feedback = feedback
# Switch the sub-models to training mode (presumably nn.Module.train()).
self.encoder.train()
self.decoder.train()
if self.use_rl:
self.critic.train()
# Reset the loss history for this training run.
self.losses = []
for iter in range(1, n_iters + 1):
# Clear gradients left over from the previous step.
self.encoder_optimizer.zero_grad()
self.decoder_optimizer.zero_grad()
if self.use_rl:
self.critic_optimizer.zero_grad()
# Presumably rollout() sets self.loss (and appends to self.losses) - verify.
self.rollout(speaker_branching=self.train_branching)
self.loss.backward()
self.encoder_optimizer.step()
self.decoder_optimizer.step()
if self.use_rl:
self.critic_optimizer.step()
# Optional second update from randomly chosen start viewpoints.
if self.random_start:
# Snapshot the loss history so it can be restored after the extra rollout.
losses = [x for x in self.losses]
self.encoder_optimizer.zero_grad()
self.decoder_optimizer.zero_grad()
# Keep the returned viewpoint ids so the environment can be reset to them afterwards.
viewpointIds = self.env.random_start(self.J)
self.rollout(reset=False)
self.env.reset_viewpointIds(viewpointIds)
self.loss.backward()
# Only the encoder and decoder are stepped here; the critic optimizer is not.
self.encoder_optimizer.step()
self.decoder_optimizer.step()
# Restore the loss history saved before the extra rollout.
self.losses = losses
The error I get is below:
RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 1; 10.75 GiB total capacity; 9.11 GiB already allocated; 3.56 MiB free; 9.93 GiB reserved in total by PyTorch)
Would really appreciate it if I could get some help. Thank you.