Out of Memory Issue when using DataParallel (LSTM)

Hello, I’ve been trying to train my model with DataParallel, but I am running into a problem.
I get a CUDA out-of-memory error whenever I train, even with a batch size of 3 (I use 3 GPUs, so that is 1 sample per GPU).

I already tried calling self.lstm.flatten_parameters(), but that did not fix the problem.

While looking for other solutions, I found suggestions that computing the loss inside model.forward might help, since each DataParallel replica would then return only the loss instead of the full output tensor. However, I do not know how to do that.

Is there a better way to solve this issue,
or can anyone help me compute the loss inside model.forward? I have put a rough sketch of what I think it might look like below.
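
This is only a sketch based on what I have read, and I have not verified it; the wrapper name, the extra targets argument, and the use of CrossEntropyLoss are my own assumptions. The idea would be that each DataParallel replica returns a small loss tensor instead of the full (batch, length, vocab) logits, so far less memory has to be gathered on GPU 0.

import torch
import torch.nn as nn

class SpeakerDecoderWithLoss(nn.Module):
    # Hypothetical wrapper around the SpeakerDecoder below (names made up by me).
    def __init__(self, decoder, padding_idx):
        super().__init__()
        self.decoder = decoder
        # Sum per-token losses inside each replica; normalize after gathering.
        self.criterion = nn.CrossEntropyLoss(ignore_index=padding_idx, reduction='sum')

    def forward(self, words, ctx, ctx_mask, h0, c0, targets):
        logit, h1, c1 = self.decoder(words, ctx, ctx_mask, h0, c0)
        # Flatten logits to (batch*length, vocab) and targets to (batch*length,).
        loss = self.criterion(logit.view(-1, logit.size(-1)), targets.view(-1))
        # Count non-pad tokens so the caller can average the loss correctly.
        ntokens = (targets != self.criterion.ignore_index).sum()
        # Return 1-element tensors so DataParallel gathers one value per GPU.
        return loss.unsqueeze(0), ntokens.unsqueeze(0), h1, c1

In the training loop I would then (I think) do loss = gathered_loss.sum() / gathered_ntokens.sum() before calling backward(), instead of gathering the full logits on one GPU first. Is that the right idea, or am I missing something?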

The code for the model is as below.

class SpeakerDecoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, padding_idx, hidden_size, dropout_ratio): #hidden size = rnn_dim = 512
        super(SpeakerDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.drop = nn.Dropout(dropout_ratio)
        self.attention_layer = SoftDotAttention(hidden_size)
        self.projection = nn.Linear(hidden_size, vocab_size)

    def forward(self, words, ctx, ctx_mask, h0, c0): 
        embeds = self.embedding(words)
        embeds = self.drop(embeds)
        self.lstm.flatten_parameters()
        # Hidden/cell states reshaped to (num_layers=1, batch=3, hidden=512); batch size is hard-coded here.
        x, (h1, c1) = self.lstm(embeds, (h0.view(1, 3, 512), c0.view(1, 3, 512)))

        x = self.drop(x)

        # Get the size
        batchXlength = words.size(0) * words.size(1)
        multiplier = batchXlength // ctx.size(0)         # By using this, it also supports the beam-search

        # Att and Handle with the shape
        # Reshaping x          <the output> --> (b(word)*l(word), r)
        # Expand the ctx from  (b, a, r)    --> (b(word)*l(word), a, r)
        # Expand the ctx_mask  (b, a)       --> (b(word)*l(word), a)
        # print("IN:")
        # print(x.contiguous().view(batchXlength, self.hidden_size).size())
        # print(ctx.unsqueeze(1).expand(-1, multiplier, -1, -1).contiguous(). view(batchXlength, -1, self.hidden_size).size())
        x, _ = self.attention_layer(
            x.contiguous().view(batchXlength, self.hidden_size),
            ctx.unsqueeze(1).expand(-1, multiplier, -1, -1).contiguous().view(batchXlength, -1, self.hidden_size),
            mask=ctx_mask.unsqueeze(1).expand(-1, multiplier, -1).contiguous().view(batchXlength, -1)
        )
        x = x.view(words.size(0), words.size(1), self.hidden_size)

        # Output the prediction logit
        x = self.drop(x)
        logit = self.projection(x)

        return logit, h1, c1
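
For context, I wrap the decoder in DataParallel in the standard way, roughly like this (rewritten from memory; the sizes here are placeholders, not my real configuration):

decoder = SpeakerDecoder(vocab_size=1000, embedding_size=256, padding_idx=0,
                         hidden_size=512, dropout_ratio=0.5)
decoder = nn.DataParallel(decoder).cuda()   # replicated across my 3 GPUs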

The code for the training loop is as below.

def train(self, n_iters, feedback='teacher'):
        ''' Train for a given number of iterations '''
        assert feedback in self.feedback_options
        self.feedback = feedback
        self.encoder.train()
        self.decoder.train()
        if self.use_rl:
            self.critic.train()
        self.losses = []
        for iter in range(1, n_iters + 1):
            self.encoder_optimizer.zero_grad()
            self.decoder_optimizer.zero_grad()
            if self.use_rl:
                self.critic_optimizer.zero_grad()
            self.rollout(speaker_branching=self.train_branching)
            self.loss.backward()
            self.encoder_optimizer.step()
            self.decoder_optimizer.step()
            if self.use_rl:
                self.critic_optimizer.step()
            if self.random_start:
                losses = [x for x in self.losses]
                self.encoder_optimizer.zero_grad()
                self.decoder_optimizer.zero_grad()
                viewpointIds = self.env.random_start(self.J)
                self.rollout(reset=False)
                self.env.reset_viewpointIds(viewpointIds)
                self.loss.backward()
                self.encoder_optimizer.step()
                self.decoder_optimizer.step()
                self.losses = losses

The error I get is as below.

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 1; 10.75 GiB total capacity; 9.11 GiB already allocated; 3.56 MiB free; 9.93 GiB reserved in total by PyTorch)

Would really appreciate it if I could get some help. Thank you.