Backward through topk operation on Variable

Suriya · October 28, 2017, 7:01pm

Hi, Pytorch beginner here.

Does PyTorch support backward through the topk operation on Variables?

    # Excerpt of the mask computation: score every position, keep the top half,
    # and turn the selection into a 0/1 mask with the score map's shape.
    features = get_feature(x)          ### x is (B, 3, 128, 128)
    
    score = regressor(features)        ### score here is (B, 1, 128, 128)
    # remember the original shape, then flatten all scores into one vector
    size_ = score.size()
    score = score.view(-1)

    # pick the k = N/2 largest entries of the flattened scores;
    # only `indices` is used below, the selected values (`topk`) are not
    topk, indices = torch.topk(score, score.size()[0]/2)

    # fresh all-zero Variable on the GPU with the same length as `score`
    mask = Variable(torch.zeros(score.size()).cuda(), requires_grad=True)

    # write the constant 1 at the selected positions
    mask = mask.scatter(0, indices, 1)          ### thresholding
    
    mask = mask.view(size_)            ### mask here is (B, 1, 128, 128)

upon loss.backward(), I get mask.grad as

    Variable containing:
    ( 0 , 0 ,.,.) = 
      1.4381e-07  4.3006e-07  1.3524e-08  ...   9.8986e-08  1.8383e-07  3.0903e-07
      4.8662e-07  6.9170e-07  3.4237e-08  ...  -1.7501e-08  1.2937e-08  1.4631e-07
      4.7671e-07  3.3662e-07  4.4110e-08  ...  -2.6617e-08  6.9062e-08  1.2557e-07
                     ...                   ⋱ 
      [torch.cuda.FloatTensor of size 64x1x128x128 (GPU 0)]

But

# Print the gradient stored on each parameter of `net` (Python 2 print statement).
for f in net.parameters():
        print f.grad

gives,

Variable containing:
(0 ,0 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

for all parameters.

Am I doing something wrong?
Is there a better way to do this?
Thanks in advance.

SimonW · October 28, 2017, 8:03pm

PyTorch definitely is able to backward through topk. Could you please post a full script with how loss is calculated and how the topk variable is used? Thanks

Suriya · October 28, 2017, 8:58pm

Here is the script. net_A is an encoder-decoder network whose forward() computes a binary mask (the code in question), and net_B is an autoencoder trained for reconstruction.

# Training loop for net_A: net_A produces a mask, the masked image is fed
# through net_B, and the loss is built from net_B's output.
# net_A is in training mode, net_B in eval mode; only optimizer_A is stepped.
net_A.train()
net_B.eval()
optimizer_A.zero_grad()
for batch_idx, inputs in enumerate(train_loader):
    
    if use_cuda:
        inputs = inputs.cuda()
    
    inputs_a = Variable(inputs)
    
    mask = net_A(inputs_a)                       ### takes image as input and outputs mask of same resolution
    # mask is not a leaf, so ask autograd to keep its .grad for inspection below
    mask.retain_grad()
    # element-wise product of the image with the mask
    inputs_b = torch.mul(inputs_a, mask)
    outputs_b = net_B(inputs_b)                ### takes corrupted image as input and outputs reconstruction of same resolution
    
    # target is a copy of the input with each of the three channels
    # scaled by 1 / (3 * std_bgr[channel])
    targets_b = inputs_a.clone()
    targets_b[:,0,:,:] = torch.mul(targets_b[:,0,:,:], 1.0/(3.0*std_bgr[0]))
    targets_b[:,1,:,:] = torch.mul(targets_b[:,1,:,:], 1.0/(3.0*std_bgr[1]))
    targets_b[:,2,:,:] = torch.mul(targets_b[:,2,:,:], 1.0/(3.0*std_bgr[2]))

    # 1 minus the squared error between output and target, both weighted by
    # (1 - mask), summed and divided by the number of output elements
    loss = 1 - torch.sum((outputs_b*(1-mask) - targets_b*(1-mask))**2)/outputs_b.nelement()
    
    for f in net_A.parameters():
        f.retain_grad()

    loss.backward()
    
    # debugging output: compare the gradient on the mask with the gradients
    # that reached net_A's parameters
    print mask.grad                          ### gives gradient values
    for f in net_A.parameters():
        print f.grad                         ### gives zeros for all params

    optimizer_A.step()
    optimizer_A.zero_grad()




class net_A(nn.Module):
    """ResNet-style encoder followed by a transposed-conv decoder and a 1x1 scorer.

    ``forward`` turns the per-position scores into a 0/1 mask that marks the
    top-scoring half of all positions in the (flattened) batch.

    NOTE: the mask returned by ``forward`` is built only from the *indices*
    returned by ``torch.topk`` and the constant 1. Indices are not
    differentiable, so no gradient flows from the mask back to ``score`` or to
    this module's parameters; their ``.grad`` stays zero after ``backward()``.
    """

    def __init__(self, block, layers):
        """Build encoder stages, decoder, and the 1-channel regressor.

        Args:
            block: residual block class. It is called as
                ``block(inplanes, planes, stride, downsample)`` and must expose
                an ``expansion`` attribute.
            layers: sequence of four ints, the number of blocks in each of the
                four encoder stages.
        """
        super(net_A, self).__init__()
        #### Resnet
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        #### Resnet

        #### Deconvs
        self.deconv1 = nn.ConvTranspose2d(512 * block.expansion, 512, kernel_size=4, stride=2, padding=1, bias=False)
        self.bn_d1 = nn.BatchNorm2d(512)
        self.deconv2 = nn.ConvTranspose2d(512, 256, kernel_size=4, stride=2, padding=1, bias=False)
        self.bn_d2 = nn.BatchNorm2d(256)
        self.deconv3 = nn.ConvTranspose2d(256, 128, kernel_size=4, stride=2, padding=1, bias=False)
        self.bn_d3 = nn.BatchNorm2d(128)
        self.deconv4 = nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1, bias=False)
        self.bn_d4 = nn.BatchNorm2d(64)
        self.deconv5 = nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2, padding=1, bias=False)
        self.bn_d5 = nn.BatchNorm2d(32)
        #### Deconvs

        # 1x1 convolution mapping the 32 decoder channels to a single score map
        self.regressor = nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0, bias=True)

        # Weight init: zero-mean normal with std sqrt(2 / (kh * kw * out_channels))
        # for Conv2d, and weight=1 / bias=0 for BatchNorm2d.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one encoder stage of ``blocks`` residual blocks.

        Only the first block receives ``stride`` and the optional ``downsample``
        projection (1x1 conv + BatchNorm), which is created when the stride is
        not 1 or the channel count changes. ``self.inplanes`` is updated to the
        stage's output channel count.

        Returns:
            nn.Sequential containing the stage's blocks.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def get_feature(self, x):
        """Run the encoder and the five deconv/BN/ReLU decoder stages on ``x``.

        Returns the 32-channel decoder feature map.
        """
        # encoder stem
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        # encoder stages
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # decoder: each stage is a stride-2 transposed conv, BatchNorm, ReLU
        x = self.deconv1(x)
        x = self.bn_d1(x)
        x = self.relu(x)

        x = self.deconv2(x)
        x = self.bn_d2(x)
        x = self.relu(x)

        x = self.deconv3(x)
        x = self.bn_d3(x)
        x = self.relu(x)

        x = self.deconv4(x)
        x = self.bn_d4(x)
        x = self.relu(x)

        x = self.deconv5(x)
        x = self.bn_d5(x)
        x = self.relu(x)

        return x

    def forward(self, x):
        """Return a 0/1 mask marking the top-scoring half of all positions.

        The scores of the whole batch are flattened into one vector before the
        selection, so the top half is taken across the batch, not per sample.
        """
        features = self.get_feature(x)          ### x is (B, 3, 128, 128)

        score = self.regressor(features)        ### score here is (B, 1, 128, 128)
        size_ = score.size()
        score = score.view(-1)

        # Floor division keeps k an integer on Python 3 as well as Python 2.
        # Only the indices are used; the selected values are discarded.
        _, indices = torch.topk(score, score.size()[0] // 2)

        # NOTE: a fresh zero Variable plus a constant scattered at
        # non-differentiable indices has no gradient path back to `score`.
        mask = Variable(torch.zeros(score.size()).cuda(), requires_grad=True)

        mask = mask.scatter(0, indices, 1)          ### thresholding

        mask = mask.view(size_)            ### mask here is (B, 1, 128, 128)

        return mask

Sourabh_Daptardar · October 29, 2017, 9:35am

I believe ‘topk’ is differentiable and ‘indices’ is not (in the autograd graph sense).

Digging in autograd code I found the following line related to class Topk(_MultiSelectionFunction):

github.com

pytorch/pytorch/blob/4f33b136d8ba547725022840ee8adf77e1ed245e/torch/autograd/_functions/tensor.py#L537


      
          
          # Forward pass of the quoted autograd Function: calls the tensor
          # method named after the lowercased Function class name.
          @staticmethod
          def forward(ctx, input, dim, return_indices, args):
              fn = getattr(input, ctx._forward_cls.__name__.lower())
              # stash what backward needs
              ctx.return_indices = return_indices
              ctx.input_size = input.size()
              ctx.dim = dim
              output, indices = fn(*args)
              if return_indices:
                  # indices are saved for backward and explicitly flagged as
                  # non-differentiable before being returned to the caller
                  ctx.save_for_backward(indices)
                  ctx.mark_non_differentiable(indices)
                  return output, indices
              else:
                  # indices are kept on ctx only; the caller receives the values
                  ctx.indices = indices
                  return output
          
          @staticmethod
          def backward(ctx, grad_output, grad_indices=None):
              grad_input = Variable(grad_output.data.new(ctx.input_size).zero_())
              if ctx.return_indices:
                  indices, = ctx.saved_variables

Perhaps @apaszke or anyone familiar with the autograd code could clarify and confirm. Is there some documentation on this?

SimonW · July 31, 2019, 3:04pm

The values are differentiable, but the indices aren’t, similar to how argmax is just not a differentiable function.

pinocchio · February 11, 2020, 9:32pm

@SimonW Can you clarify how topk is differentiable? How is it different from a generic k-argmax?

SimonW · February 12, 2020, 5:22pm

As I said above, the values are, the indices are not.