In-place operation keeps happening

Hi,

I don’t understand how to deal with in-place operations while still guaranteeing that autograd works properly. In my code, I first define 3 MLPs with the same structure, then use them to modify tensors.
I’m using tensor.reshape to avoid creating new tensors, but in-place operations keep happening.

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [5, 2, 20]], which is output 0 of torch::autograd::CopySlices, is at version 20; expected version 10 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
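To make sure I understand the error itself, here is a toy sketch (made-up names, unrelated to my model) that reproduces the same class of failure for me:

import torch

x = torch.rand(5, 2, 20, requires_grad=True)
y = x.exp()         # exp() saves its output y for the backward pass
y[0] = 0.0          # slice assignment mutates y in place (recorded as CopySlices)
y.sum().backward()  # RuntimeError: ... modified by an inplace operation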

    def forward(self, F_ue, Mesg_ue, E, P, Noise):
        '''
        :param F_ue: of size [2MN,K], randomly generated at the first iteration
        :param Mesg_ue: of size [K,M,2MN], randomly generated at the first iteration
        :param E: of size [M,K,2MN]
        :param P: of size [M,1]
        :param Noise: of size [K,1]
        :return: F_ue, Mesg_ue
        '''
        M = P.size(0)
        K = Noise.size(0)
        N = E.size(dim=2)//2
        agg_ue = torch.mean(Mesg_ue, dim=0) # of size [M,2MN]
        Mesg_bs = Mesg_ue.reshape(M,K,2*M*N)
        for m in range(M):
            for k in range(K):
                Mesg_bs[m,k,:] = self.mlp1(E[m,k,:], agg_ue[m,:], P[m], Noise[k])  # it seems like in-place operation happens here
        # Mesg_bs <CopySlices object at 0x2b410bc78c10>
        
        agg_bs = torch.mean(Mesg_bs, dim=0) # of size [K,2MN]
        Mesg_ue = Mesg_bs.reshape(K,M,2*M*N)
        for k in range(K):
            for m in range(M):                
                Mesg_ue[k,m,:] = self.mlp2(F_ue[:,k], E[m,k,:], agg_bs[k,:], P[m], Noise[k])
        # Mesg_ue <AsStridedBackward0 object at 0x2b410bc78c70>

        for k in range(K):
            E_cat = torch.cat([E[m, k, :] for m in range(M)], dim=0)
            F_ue[:,k] = self.mlp3(E_cat, F_ue[:,k], agg_bs[k,:], P[0], Noise[k])
        # F_ue <CopySlices object at 0x2b410bc78c10>

        return F_ue, Mesg_ue

F_ue0 = torch.rand(2 * M * N, K, requires_grad=True)
# F_ue0 <ToCopyBackward0 object at 0x2b410bc78be0>
Mesg_ue0 = torch.rand(K, M, 2 * M * N, requires_grad=True)
# Mesg_ue0 <ToCopyBackward0 object at 0x2b410bc78be0>
F_ue, Mesg_ue = model(F_ue0, Mesg_ue0, E, P, Noise)
loss = Loss(F_ue)
loss.backward()
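
Following the hint in the error message, I plan to rerun with anomaly detection enabled so the stack trace points at the forward op that created the problematic tensor:

torch.autograd.set_detect_anomaly(True)  # report the forward op in the backward error
F_ue, Mesg_ue = model(F_ue0, Mesg_ue0, E, P, Noise)
loss = Loss(F_ue)
loss.backward()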

Also, since only F_ue is used to compute the loss, I’m curious whether the parameters of mlp1 and mlp2 will be updated at all.
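In case it’s useful, here is roughly how I was planning to check that after backward (model being my NodeUpdateLayer instance; just a quick diagnostic):

# After loss.backward(), check which parameters actually received gradients.
for name, param in model.named_parameters():
    has_grad = param.grad is not None and param.grad.abs().sum().item() > 0
    print(f"{name}: gradient received = {has_grad}")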
Thanks for any help.

Hi @sunnylyu,

Can you share the code of the self.mlp1 object? (And the rest of the class too, so it’s a minimal reproducible example).

The stack trace with torch.autograd.set_detect_anomaly(True) enabled would be useful too!

Thanks for the fast reply!
Here is my full code. Please let me know if more info is needed. Thanks very much for your help.

import torch
from torch.nn import Linear

class NodeUpdateLayer(torch.nn.Module):
    def __init__(self, edge_feature_size, num_antenna, num_BS):
        '''
        :param edge_feature_size: int, the dimension of edge_feature, e_mk of size 2*N
        :param num_antenna: int, number of antennas at each BS
        :param num_BS: int, number of BS
        '''
        super(NodeUpdateLayer, self).__init__()

        # MLP for generating message at BS
        self.mlp1 = torch.nn.Sequential(
            Linear(edge_feature_size + 2*num_BS*num_antenna + 2, 512),  # power and noise are also inputs
            torch.nn.ReLU(),
            Linear(512, 512),
            torch.nn.ReLU(),
            Linear(512, 512),
            torch.nn.ReLU(),
            Linear(512, 2*num_BS*num_antenna)
        )

        # MLP for generating message at UE
        self.mlp2 = torch.nn.Sequential(
            Linear(2*num_BS*num_antenna + edge_feature_size + 2*num_BS*num_antenna + 2, 512),
            torch.nn.ReLU(),
            Linear(512, 512),
            torch.nn.ReLU(),
            Linear(512, 512),
            torch.nn.ReLU(),
            Linear(512, 2*num_BS*num_antenna)
        )

        # MLP for updating the UE representation
        self.mlp3 = torch.nn.Sequential(
            Linear(2*num_BS*num_antenna + num_BS*edge_feature_size + 2*num_BS*num_antenna + 2, 512),
            torch.nn.ReLU(),
            Linear(512, 512),
            torch.nn.ReLU(),
            Linear(512, 512),
            torch.nn.ReLU(),
            Linear(512, 2*num_BS*num_antenna)
        )

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)
        self._wrap_parameters()
        self._init_weights()

    def _wrap_parameters(self):
        for module in self.modules():
            if isinstance(module, Linear):
                module.weight = torch.nn.Parameter(module.weight)
                module.bias = torch.nn.Parameter(module.bias)

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, Linear):
                torch.nn.init.xavier_uniform_(m.weight, gain=1.0)  # set the gain parameter to a non-zero value
                torch.nn.init.constant_(m.bias, 0.4)

    def Message_bs(self, edge_feature, agg_ue, power, noise):
        message_bs = self.mlp1(torch.cat((edge_feature, agg_ue, power, noise), 0))
        if self.device.type == "cuda":
            message_bs = message_bs.to(self.device)
        return message_bs

    def Message_ue(self, f_ue, edge_feature, agg_bs, power, noise):
        message_ue = self.mlp2(torch.cat((f_ue, edge_feature, agg_bs, power, noise), 0))
        if self.device.type == "cuda":
            message_ue = message_ue.to(self.device)
        return message_ue

    def Update(self, E_cat, f_ue, agg_bs, power, noise):
        f_ue_update = self.mlp3(torch.cat((E_cat, f_ue, agg_bs, power, noise), 0))
        if self.device.type == "cuda":
            f_ue_update = f_ue_update.to(self.device)
        return f_ue_update

    def forward(self, F_ue, Mesg_ue, Mesg_bs, E, P, Noise):
        '''
        :param F_ue: of size [2MN,K], randomly generated at the first iteration
        :param Mesg_ue: of size [K,M,2MN], randomly generated at the first iteration
        :param Mesg_bs: of size [M,K,2MN], randomly generated at the first iteration
        :param E: of size [M,K,2MN]
        :param P: of size [M,1]
        :param Noise: of size [K,1]
        :return: F_ue, Mesg_ue
        '''
        M = P.size(0)
        K = Noise.size(0)
        N = E.size(dim=2)//2
        agg_ue = torch.mean(Mesg_ue, dim=0) # of size [M,2MN]
        print("first F_ue0", F_ue.grad_fn)
        print("first Mesg_ue0", Mesg_ue.grad_fn)
        print("first Mesg_bs0", Mesg_bs.grad_fn)
        #Mesg_bs = Mesg_ue.reshape(M,K,2*M*N)
        for m in range(M):
            for k in range(K):
                Mesg_bs[m,k,:] = self.Message_bs(E[m,k,:], agg_ue[m,:], P[m], Noise[k])
        print("Mesg_bs", Mesg_bs.grad_fn)

        agg_bs = torch.mean(Mesg_bs, dim=0) # of size [K,2MN]
        #Mesg_ue = Mesg_bs.reshape(K,M,2*M*N)
        for k in range(K):
            for m in range(M):                
                Mesg_ue[k,m,:] = self.Message_ue(F_ue[:,k], E[m,k,:], agg_bs[k,:], P[m], Noise[k])
        print("Mesg_ue", Mesg_ue.grad_fn)

        for k in range(K):
            E_cat = torch.cat([E[m, k, :] for m in range(M)], dim=0)
            F_ue[:,k] = self.Update(E_cat, F_ue[:,k], agg_bs[k,:], P[0], Noise[k])
        print("F_ue", F_ue.grad_fn)

        return F_ue, Mesg_ue
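
In case it helps the discussion: one rewrite I’m considering is to collect the per-(m, k) outputs in Python lists and torch.stack them, instead of assigning into slices of a pre-allocated tensor, so nothing is written in place. A sketch of the first loop only, assuming self.Message_bs returns a 1-D tensor of size 2MN:

# Out-of-place version of the first loop: build rows in a list, then stack.
rows = []
for m in range(M):
    row = [self.Message_bs(E[m, k, :], agg_ue[m, :], P[m], Noise[k])
           for k in range(K)]
    rows.append(torch.stack(row, dim=0))  # shape [K, 2MN]
Mesg_bs = torch.stack(rows, dim=0)        # shape [M, K, 2MN], no slice assignment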