Transposing a tensor twice makes its gradient disappear

My problem is that after transposing a tensor twice, its gradient disappears.

Since the model's weight matrix is large, I compute the matrix multiplication as output = weight.mm(input.t()).t() instead of output = input.mm(weight.t()).

However, this makes the weight's gradient disappear.
Using output = input.mm(weight.t()) makes the model work fine.
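
As a sanity check outside the custom Function, both formulations give the same values (up to floating-point rounding), and plain autograd still fills in weight.grad for the double-transpose version. A minimal standalone snippet with made-up shapes:

import torch

w = torch.rand(100, 50, requires_grad=True)   # (out_features, in_features)
x = torch.rand(2, 50)                         # (batch, in_features)

out_a = w.mm(x.t()).t()                       # transpose-twice version
out_b = x.mm(w.t())                           # usual linear-layer version
print(torch.allclose(out_a, out_b))           # True

out_a.sum().backward()
print(w.grad is None)                         # False: the gradient is there

So the double transpose by itself seems fine; the problem only shows up inside my custom autograd.Function.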

Could you help me figure out what is wrong?

Below is my code:

import numpy as np

import torch
import torch.nn as nn
import torch.nn.init as init

import torch.optim as optim

device_cpu=torch.device("cpu")
device_cuda_list=[torch.device("cuda:{}".format(i)) for i in range(6)]


import math

class CustomizedLinearFunction(torch.autograd.Function):
    """
    Autograd function that masks its weight with 'mask'.
    """

    # Note that both forward and backward are @staticmethods
    @staticmethod
    # bias, mask is an optional argument
    def forward(ctx, input, weight, bias=None, mask=None):
        if mask is not None:
            # change weight to 0 where mask == 0
            weight.mul_(mask)
        
        output = weight.mm(input.t()).t()   # double-transpose version: weight.grad ends up None
        #output = input.mm(weight.t())      # original version that works fine
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        ctx.save_for_backward(input, weight, bias, mask)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):

        input, weight, bias, mask = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = grad_mask = None

        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
            if mask is not None:
                # change grad_weight to 0 where mask == 0
                #grad_weight=grad_weight*mask
                grad_weight.mul_(mask) #*
        #if bias is not None and ctx.needs_input_grad[2]:
        if ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0).squeeze(0)

        return grad_input, grad_weight, grad_bias, grad_mask


class CustomizedLinear(nn.Module):
    def __init__(self, mask, bias=True):
        super(CustomizedLinear, self).__init__()
        self.input_features = mask.shape[1]
        self.output_features = mask.shape[0]
        if isinstance(mask, torch.Tensor):
            self.mask = mask.type(torch.float)
        else:
            self.mask = torch.tensor(mask, dtype=torch.float)

        self.mask = nn.Parameter(self.mask, requires_grad=False)
        
        self.weight = nn.Parameter(torch.Tensor(self.output_features, self.input_features))
        
        if bias:
            self.bias = nn.Parameter(torch.Tensor(self.output_features))
        else:
            # You should always register all possible parameters, but the
            # optional ones can be None if you want.
            self.register_parameter('bias', None)
        self.reset_parameters()

        # mask weight
        self.weight.data = self.weight.data.mul_(self.mask)
        #self.weight.mul_(self.mask)

    def reset_parameters(self):

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)


    def forward(self, input):
        # See the autograd section for explanation of what happens here.
        return CustomizedLinearFunction.apply(input, self.weight, self.bias, self.mask)

    def extra_repr(self):
        # (Optional)Set the extra information about this module. You can test
        # it by printing an object of this class.
        return 'input_features={}, output_features={}, bias={}'.format(
            self.input_features, self.output_features, self.bias is not None
        )

adjacency=(np.random.rand(100,50)>0.5).astype(int)
adjacency

model=nn.Sequential(CustomizedLinear(mask=adjacency,bias=True)
                    ,nn.ELU()
                    ).to(device_cuda_list[-2])

#optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9,weight_decay=5e-07)
optimizer = optim.Adagrad(model.parameters(),weight_decay=5e-07)
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5,verbose=1)
loss_function = nn.BCEWithLogitsLoss()

target_model=model[0]

print(target_model.weight,target_model.weight.shape)

print(target_model.weight.grad)

print(target_model.bias,target_model.bias.shape)

print(target_model.bias.grad)

#sample=torch.rand(19000).to(device_cuda_list[0])
input_sample=torch.rand([2,50]).to(device_cuda_list[-2])

out_sample=model(input_sample)
#out_sample

loss=loss_function(out_sample,torch.zeros([2,100]).to(device_cuda_list[-2]))

loss.backward()

optimizer.step()

print(target_model.weight,target_model.weight.shape)

print(target_model.weight.grad,target_model.weight.grad.shape)

print(target_model.bias,target_model.bias.shape)

print(target_model.bias.grad,target_model.bias.grad.shape)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-20-24264289d036> in <module>
----> 1 target_model.weight.grad,target_model.weight.grad.shape

AttributeError: 'NoneType' object has no attribute 'shape'

This is really weird. I played with your code a bit and got it to stop erroring out by doing either of two things:

  • First, by doing the final transpose in-place:
output = weight.mm(input.t()).t_()
  • Second, by adding a clone() after the final transpose:
output = weight.mm(input.t()).t().clone()

I do not know why either of these works. Perhaps someone with more experience with autograd could chime in here?
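
For completeness, this is what the second workaround looks like as a drop-in replacement for your forward (only the output = ... line changes; the in-place t_() variant differs only on that same line):

    @staticmethod
    def forward(ctx, input, weight, bias=None, mask=None):
        if mask is not None:
            # change weight to 0 where mask == 0
            weight.mul_(mask)
        # clone() copies the transposed result into a fresh tensor rather than returning a view
        output = weight.mm(input.t()).t().clone()
        # alternative workaround: output = weight.mm(input.t()).t_()
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        ctx.save_for_backward(input, weight, bias, mask)
        return output

With either change, target_model.weight.grad is populated after loss.backward(), so the AttributeError above goes away.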