In-place operation keeps happening

Hi,

I don’t understand how to deal with in-place operations while still guaranteeing that autograd works properly. In my code, I first define 3 MLPs with the same structure, then use them to modify tensors.
I’m using tensor.reshape to avoid creating new tensors, but in-place operations keep happening.

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [5, 2, 20]], which is output 0 of torch::autograd::CopySlices, is at version 20; expected version 10 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
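To make sure I understand the error itself, here is a toy sketch (made-up names, unrelated to my model) that reproduces the same class of failure for me:

import torch

x = torch.rand(5, 2, 20, requires_grad=True)
y = x.exp()         # exp() saves its output y for the backward pass
y[0] = 0.0          # slice assignment mutates y in place (recorded as CopySlices)
y.sum().backward()  # RuntimeError: ... modified by an inplace operation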

    def forward(self, F_ue, Mesg_ue, E, P, Noise):
        '''
        :param F_ue: of size [2MN,K], randomly generated at the first iteration
        :param Mesg_ue: of size [K,M,2MN], randomly generated at the first iteration
        :param E: of size [M,K,2MN]
        :param P: of size [M,1]
        :param Noise: of size [K,1]
        :return: F_ue, Mesg_ue
        '''
        M = P.size(0)
        K = Noise.size(0)
        N = E.size(dim=2)//2
        agg_ue = torch.mean(Mesg_ue, dim=0) # of size [M,2MN]
        Mesg_bs = Mesg_ue.reshape(M,K,2*M*N)
        for m in range(M):
            for k in range(K):
                Mesg_bs[m,k,:] = self.mlp1(E[m,k,:], agg_ue[m,:], P[m], Noise[k])  # it seems like in-place operation happens here
        # Mesg_bs <CopySlices object at 0x2b410bc78c10>
        
        agg_bs = torch.mean(Mesg_bs, dim=0) # of size [K,2MN]
        Mesg_ue = Mesg_bs.reshape(K,M,2*M*N)
        for k in range(K):
            for m in range(M):                
                Mesg_ue[k,m,:] = self.mlp2(F_ue[:,k], E[m,k,:], agg_bs[k,:], P[m], Noise[k])
        # Mesg_ue <AsStridedBackward0 object at 0x2b410bc78c70>

        for k in range(K):
            E_cat = torch.cat([E[m, k, :] for m in range(M)], dim=0)
            F_ue[:,k] = self.mlp3(E_cat, F_ue[:,k], agg_bs[k,:], P[0], Noise[k])
        # F_ue <CopySlices object at 0x2b410bc78c10>

        return F_ue, Mesg_ue

F_ue0 = torch.rand(2 * M * N, K, requires_grad=True)
# F_ue0 <ToCopyBackward0 object at 0x2b410bc78be0>
Mesg_ue0 = torch.rand(K, M, 2 * M * N, requires_grad=True)
# Mesg_ue0 <ToCopyBackward0 object at 0x2b410bc78be0>
F_ue, Mesg_ue = model(F_ue0, Mesg_ue0, E, P, Noise)
loss = Loss(F_ue)
loss.backward()
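
Following the hint in the error message, I plan to rerun with anomaly detection enabled so the stack trace points at the forward op that created the problematic tensor:

torch.autograd.set_detect_anomaly(True)  # report the forward op in the backward error
F_ue, Mesg_ue = model(F_ue0, Mesg_ue0, E, P, Noise)
loss = Loss(F_ue)
loss.backward()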

Also, since only F_ue is used to compute the loss, I’m curious whether the parameters of mlp1 and mlp2 will be updated at all.
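In case it’s useful, here is roughly how I was planning to check that after backward (model being my NodeUpdateLayer instance; just a quick diagnostic):

# After loss.backward(), check which parameters actually received gradients.
for name, param in model.named_parameters():
    has_grad = param.grad is not None and param.grad.abs().sum().item() > 0
    print(f"{name}: gradient received = {has_grad}")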
Thanks for any help.

Hi @sunnylyu,

Can you share the code of the self.mlp1 object? (And the rest of the class too, so it’s a minimal reproducible example).

The stack trace with torch.autograd.set_detect_anomaly(True) enabled would be useful too!

Thanks for the fast reply!
Here is my full code. Please let me know if more info is needed. Thanks very much for your help.

import torch
from torch.nn import Linear

class NodeUpdateLayer(torch.nn.Module):
    def __init__(self, edge_feature_size, num_antenna, num_BS):
        '''
        :param edge_feature_size: int, the dimension of edge_feature, e_mk of size 2*N
        :param num_antenna: int, number of antennas at each BS
        :param num_BS: int, number of BS
        '''
        super(NodeUpdateLayer, self).__init__()

        # MLP for generating message at BS
        self.mlp1 = torch.nn.Sequential(
            Linear(edge_feature_size + 2*num_BS*num_antenna + 2, 512),  # power and noise are also inputs
            torch.nn.ReLU(),
            Linear(512, 512),
            torch.nn.ReLU(),
            Linear(512, 512),
            torch.nn.ReLU(),
            Linear(512, 2*num_BS*num_antenna)
        )

        # MLP for generating message at UE
        self.mlp2 = torch.nn.Sequential(
            Linear(2*num_BS*num_antenna + edge_feature_size + 2*num_BS*num_antenna + 2, 512),
            torch.nn.ReLU(),
            Linear(512, 512),
            torch.nn.ReLU(),
            Linear(512, 512),
            torch.nn.ReLU(),
            Linear(512, 2*num_BS*num_antenna)
        )

        # MLP for updating the UE representation
        self.mlp3 = torch.nn.Sequential(
            Linear(2*num_BS*num_antenna + num_BS*edge_feature_size + 2*num_BS*num_antenna + 2, 512),
            torch.nn.ReLU(),
            Linear(512, 512),
            torch.nn.ReLU(),
            Linear(512, 512),
            torch.nn.ReLU(),
            Linear(512, 2*num_BS*num_antenna)
        )

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)
        self._wrap_parameters()
        self._init_weights()

    def _wrap_parameters(self):
        for module in self.modules():
            if isinstance(module, Linear):
                module.weight = torch.nn.Parameter(module.weight)
                module.bias = torch.nn.Parameter(module.bias)

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, Linear):
                torch.nn.init.xavier_uniform_(m.weight, gain=1.0)  # set the gain parameter to a non-zero value
                torch.nn.init.constant_(m.bias, 0.4)

    def Message_bs(self, edge_feature, agg_ue, power, noise):
        message_bs = self.mlp1(torch.cat((edge_feature, agg_ue, power, noise), 0))
        if self.device.type == "cuda":
            message_bs = message_bs.to(self.device)
        return message_bs

    def Message_ue(self, f_ue, edge_feature, agg_bs, power, noise):
        message_ue = self.mlp2(torch.cat((f_ue, edge_feature, agg_bs, power, noise), 0))
        if self.device.type == "cuda":
            message_ue = message_ue.to(self.device)
        return message_ue

    def Update(self, E_cat, f_ue, agg_bs, power, noise):
        f_ue_update = self.mlp3(torch.cat((E_cat, f_ue, agg_bs, power, noise), 0))
        if self.device.type == "cuda":
            f_ue_update = f_ue_update.to(self.device)
        return f_ue_update

    def forward(self, F_ue, Mesg_ue, Mesg_bs, E, P, Noise):
        '''
        :param F_ue: of size [2MN,K], randomly generated at the first iteration
        :param Mesg_ue: of size [K,M,2MN], randomly generated at the first iteration
        :param Mesg_bs: of size [M,K,2MN], randomly generated at the first iteration
        :param E: of size [M,K,2MN]
        :param P: of size [M,1]
        :param Noise: of size [K,1]
        :return: F_ue, Mesg_ue
        '''
        M = P.size(0)
        K = Noise.size(0)
        N = E.size(dim=2)//2
        agg_ue = torch.mean(Mesg_ue, dim=0) # of size [M,2MN]
        print("first F_ue0", F_ue.grad_fn)
        print("first Mesg_ue0", Mesg_ue.grad_fn)
        print("first Mesg_bs0", Mesg_bs.grad_fn)
        #Mesg_bs = Mesg_ue.reshape(M,K,2*M*N)
        for m in range(M):
            for k in range(K):
                Mesg_bs[m,k,:] = self.Message_bs(E[m,k,:], agg_ue[m,:], P[m], Noise[k])
        print("Mesg_bs", Mesg_bs.grad_fn)

        agg_bs = torch.mean(Mesg_bs, dim=0) # of size [K,2MN]
        #Mesg_ue = Mesg_bs.reshape(K,M,2*M*N)
        for k in range(K):
            for m in range(M):                
                Mesg_ue[k,m,:] = self.Message_ue(F_ue[:,k], E[m,k,:], agg_bs[k,:], P[m], Noise[k])
        print("Mesg_ue", Mesg_ue.grad_fn)

        for k in range(K):
            E_cat = torch.cat([E[m, k, :] for m in range(M)], dim=0)
            F_ue[:,k] = self.Update(E_cat, F_ue[:,k], agg_bs[k,:], P[0], Noise[k])
        print("F_ue", F_ue.grad_fn)

        return F_ue, Mesg_ue
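
In case it helps the discussion: one rewrite I’m considering is to collect the per-(m, k) outputs in Python lists and torch.stack them, instead of assigning into slices of a pre-allocated tensor, so nothing is written in place. A sketch of the first loop only, assuming self.Message_bs returns a 1-D tensor of size 2MN:

# Out-of-place version of the first loop: build rows in a list, then stack.
rows = []
for m in range(M):
    row = [self.Message_bs(E[m, k, :], agg_ue[m, :], P[m], Noise[k])
           for k in range(K)]
    rows.append(torch.stack(row, dim=0))  # shape [K, 2MN]
Mesg_bs = torch.stack(rows, dim=0)        # shape [M, K, 2MN], no slice assignment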