My problem is that after transposing a tensor twice inside my custom autograd function, the weight's gradient disappears.
Since the model's weight matrix is large, I compute the matrix multiplication as output = weight.mm(input.t()).t() instead of output = input.mm(weight.t()).
However, this makes weight's gradient disappear: after loss.backward(), weight.grad is still None.
With output = input.mm(weight.t()) the model works fine.
Could you help me figure out what is wrong?
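For reference, the two forms should be mathematically equivalent, since (W xᵀ)ᵀ = x Wᵀ. A quick numerical check (a minimal sketch with made-up shapes):

import torch

w = torch.rand(100, 50)  # weight: (out_features, in_features)
x = torch.rand(2, 50)    # input: (batch, in_features)

# The double-transpose form should match the plain form
# up to floating-point tolerance.
print(torch.allclose(w.mm(x.t()).t(), x.mm(w.t())))  # expect: True

So the forward values should agree; only the gradient flow seems to differ.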
Below is my code:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim

device_cpu = torch.device("cpu")
device_cuda_list = [torch.device("cuda:{}".format(i)) for i in range(6)]
class CustomizedLinearFunction(torch.autograd.Function):
    """
    Autograd function that masks its weight with `mask`.
    """
    # Note that both forward and backward are @staticmethods.
    @staticmethod
    def forward(ctx, input, weight, bias=None, mask=None):
        # bias and mask are optional arguments
        if mask is not None:
            # zero the weight where mask == 0
            weight.mul_(mask)
        output = weight.mm(input.t()).t()   # this version loses the gradient
        # output = input.mm(weight.t())     # this version works fine
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        ctx.save_for_backward(input, weight, bias, mask)
        return output

    # This function has only a single output, so it gets only one gradient.
    @staticmethod
    def backward(ctx, grad_output):
        input, weight, bias, mask = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = grad_mask = None
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
            if mask is not None:
                # zero grad_weight where mask == 0
                grad_weight.mul_(mask)
        if ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0).squeeze(0)
        return grad_input, grad_weight, grad_bias, grad_mask
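To take the module and the optimizer out of the picture, here is a minimal repro of the function alone (a sketch with made-up small shapes):

w = torch.rand(4, 3, requires_grad=True)   # (out_features, in_features)
x = torch.rand(2, 3)                       # (batch, in_features)
b = torch.rand(4, requires_grad=True)
m = (torch.rand(4, 3) > 0.5).float()

out = CustomizedLinearFunction.apply(x, w, b, m)
out.sum().backward()
print(w.grad)  # check whether the gradient is None here as well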
class CustomizedLinear(nn.Module):
    def __init__(self, mask, bias=True):
        super(CustomizedLinear, self).__init__()
        self.input_features = mask.shape[1]
        self.output_features = mask.shape[0]
        if isinstance(mask, torch.Tensor):
            self.mask = mask.type(torch.float)
        else:
            self.mask = torch.tensor(mask, dtype=torch.float)
        self.mask = nn.Parameter(self.mask, requires_grad=False)
        self.weight = nn.Parameter(torch.Tensor(self.output_features, self.input_features))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(self.output_features))
        else:
            # You should always register all possible parameters, but the
            # optional ones can be None if you want.
            self.register_parameter('bias', None)
        self.reset_parameters()
        # mask the initial weight
        self.weight.data = self.weight.data.mul_(self.mask)

    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        # See the autograd section for explanation of what happens here.
        return CustomizedLinearFunction.apply(input, self.weight, self.bias, self.mask)

    def extra_repr(self):
        # (Optional) Set extra information about this module. You can test
        # it by printing an object of this class.
        return 'input_features={}, output_features={}, bias={}'.format(
            self.input_features, self.output_features, self.bias is not None
        )
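As the comment in extra_repr says, the module can be sanity-checked by printing an instance (hypothetical tiny mask, just for illustration):

print(CustomizedLinear(mask=np.ones((4, 3))))
# expected: CustomizedLinear(input_features=3, output_features=4, bias=True)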
adjacency = (np.random.rand(100, 50) > 0.5).astype(int)

model = nn.Sequential(
    CustomizedLinear(mask=adjacency, bias=True),
    nn.ELU(),
).to(device_cuda_list[-2])
# also tried: optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-07)
optimizer = optim.Adagrad(model.parameters(), weight_decay=5e-07)
loss_function = nn.BCEWithLogitsLoss()

target_model = model[0]
print(target_model.weight, target_model.weight.shape)
print(target_model.weight.grad)   # None before backward, as expected
print(target_model.bias, target_model.bias.shape)
print(target_model.bias.grad)

input_sample = torch.rand([2, 50]).to(device_cuda_list[-2])
out_sample = model(input_sample)
loss = loss_function(out_sample, torch.zeros([2, 100]).to(device_cuda_list[-2]))
loss.backward()
optimizer.step()

print(target_model.weight, target_model.weight.shape)
print(target_model.weight.grad, target_model.weight.grad.shape)   # fails: grad is still None
print(target_model.bias, target_model.bias.shape)
print(target_model.bias.grad, target_model.bias.grad.shape)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-20-24264289d036> in <module>
----> 1 target_model.weight.grad,target_model.weight.grad.shape
AttributeError: 'NoneType' object has no attribute 'shape'
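For completeness, a quick way to list which parameters actually received gradients after loss.backward() (a small sketch):

for name, p in model.named_parameters():
    print(name, 'requires_grad={}'.format(p.requires_grad),
          'grad is None: {}'.format(p.grad is None))

With the double-transpose forward, the weight's grad comes out as None for me, while the plain forward populates it.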