Storing task specific layers output for further use in multi-task learning

I am trying to train two tasks one after another in multitask learning. I have two CNNs for two tasks. While I am computing task 1 layers I also need to use CNN layers output from the other task. I am getting retain_graph = True error as I initialized all layers output as tensors! I don’t know how to fix it!

# Codes for https://www.aclweb.org/anthology/N18-2114/ / Gated Multi-Task Network for Text Classification
class GCNN_MTL(nn.Module):
    def __init__(self, lins, opt):
        super(GCNN_MTL, self).__init__()
        self.opt = opt
        self.tasks = 2
        self.cnns = clones(CNN_Part(opt.EMB_DIM, opt.NUM_HIDDEN, opt), self.tasks)
        self.linears = lins
        self.sig_gate = Sigma_Gate()
        self.conv_out = [[torch.zeros(self.opt.NUM_FILTER, self.opt.LENGTH - 1).cuda(),
                         torch.zeros(self.opt.NUM_FILTER, self.opt.LENGTH - 2).cuda(),
                         torch.zeros(self.opt.NUM_FILTER, self.opt.LENGTH - 3).cuda()]] * self.tasks
        self.pool_out = [[torch.zeros(self.opt.NUM_FILTER).cuda(), torch.zeros(self.opt.NUM_FILTER).cuda(),
                         torch.zeros(self.opt.NUM_FILTER).cuda()]] * self.tasks
        self.fc_out = [torch.zeros(self.opt.MID_DIM).cuda()] * self.tasks

    def forward(self, w_emb, num_task):
        self.conv_out[num_task] = self.cnns[num_task](w_emb)  # [B, Co, L] * K
        g_conv_out = [self.sig_gate(x, y) for x, y in zip(self.conv_out[num_task], self.conv_out[1 - num_task])]
        self.pool_out[num_task] = [F.max_pool1d(x, x.size(2)).squeeze(2) for x in g_conv_out]  # [B, Co] * K
        g_pool_out = [self.sig_gate(x, y) for x, y in zip(self.pool_out[num_task], self.pool_out[1 - num_task])]
        g_pool_out = torch.cat(g_pool_out, dim=1)  # [B, Co * K]

        self.fc_out[num_task] = self.linears[num_task](g_pool_out)
        gated_fc_out = self.sig_gate(self.fc_out[num_task], self.fc_out[1 - num_task])
        return gated_fc_out