How do I implement the backward function for a custom autograd.Function?

I am currently revising the code below (the main training script):

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os
import torch
import argparse
import data
import util
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from models import nin
from torch.autograd import Variable

def save_state(model, best_acc):
    print('==> Saving model ...')
    state = {
            'best_acc': best_acc,
            'state_dict': model.state_dict(),
    for key in state['state_dict'].keys():
        if 'module' in key:
            state['state_dict'][key.replace('module.', '')] = \
                    state['state_dict'].pop(key), 'models/nin.pth.tar')

def train(epoch):
    for batch_idx, (data, target) in enumerate(trainloader):
        # process the weights including binarization
        # forwarding
        data, target = Variable(data.cuda()), Variable(target.cuda())
        output = model(data)
        # backwarding
        loss = criterion(output, target)
        # restore weights
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLR: {}'.format(
                epoch, batch_idx * len(data), len(trainloader.dataset),
                100. * batch_idx / len(trainloader),,

def test():
    global best_acc
    test_loss = 0
    correct = 0
    for data, target in testloader:
        data, target = Variable(data.cuda()), Variable(target.cuda())
        output = model(data)
        test_loss += criterion(output, target).data.item()
        pred =, keepdim=True)[1]
        correct += pred.eq(
    acc = 100. * float(correct) / len(testloader.dataset)
    if acc > best_acc:
        best_acc = acc
        save_state(model, best_acc)
    test_loss /= len(testloader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(
        test_loss * 128., correct, len(testloader.dataset),
        100. * float(correct) / len(testloader.dataset)))
    print('Best Accuracy: {:.2f}%\n'.format(best_acc))

def adjust_learning_rate(optimizer, epoch):
    """Decay every parameter group's learning rate by 10x whenever the
    current epoch hits one of the fixed milestone epochs."""
    milestones = (120, 200, 240, 280)
    if epoch not in milestones:
        return
    for group in optimizer.param_groups:
        group['lr'] *= 0.1

if __name__=='__main__':
    # prepare the options
    parser = argparse.ArgumentParser()
    parser.add_argument('--cpu', action='store_true',
            help='set if only CPU is available')
    parser.add_argument('--data', action='store', default='./data/',
            help='dataset path')
    parser.add_argument('--arch', action='store', default='nin',
            help='the architecture for the network: nin')
    parser.add_argument('--lr', action='store', default='0.01',
            help='the intial learning rate')
    parser.add_argument('--pretrained', action='store', default=None,
            help='the path to the pretrained model')
    parser.add_argument('--evaluate', action='store_true',
            help='evaluate the model')
    args = parser.parse_args()
    print('==> Options:',args)

    # set the seed
    # prepare the data
    if not os.path.isfile('/train_data'):
        # check the data path
        raise Exception\
                ('Please assign the correct data path with --data <DATA_PATH>')

    trainset = data.dataset(, train=True)
    trainloader =, batch_size=128,
            shuffle=True, num_workers=2)

    testset = data.dataset(, train=False)
    testloader =, batch_size=100,
            shuffle=False, num_workers=2)
    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),

    transform_test = transforms.Compose([
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),

    trainset = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True, transform=transform_train)
    trainloader =
        trainset, batch_size=512, shuffle=True, num_workers=12)#num_work = # of CPU we use

    testset = torchvision.datasets.CIFAR10(
        root='./data', train=False, download=True, transform=transform_test)
    testloader =
        testset, batch_size=100, shuffle=False, num_workers=12)

    # define classes
    classes = ('plane', 'car', 'bird', 'cat',
            'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

    # define the model
    print('==> building model',args.arch,'...')
    if args.arch == 'nin':
        model = nin.Net()
        raise Exception(args.arch+' is currently not supported')

    # initialize the model
    if not args.pretrained:
        print('==> Initializing model parameters ...')
        best_acc = 0
        for m in model.modules():
            if isinstance(m, nn.Conv2d):
      , 0.05)
        print('==> Load pretrained model form', args.pretrained, '...')
        pretrained_model = torch.load(args.pretrained)
        best_acc = pretrained_model['best_acc']

    if not args.cpu:
        model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))

    # define solver and criterion
    base_lr = float(
    param_dict = dict(model.named_parameters())
    params = []

    for key, value in param_dict.items():
        params += [{'params':[value], 'lr': base_lr,

    optimizer = optim.Adam(params, lr=0.10,weight_decay=0.00001)
    criterion = nn.CrossEntropyLoss()

    # define the binarization operator
    bin_op = util.BinOp(model)

    # do the evaluation if specified
    if args.evaluate:

    # start training
    for epoch in range(1, 320):
        adjust_learning_rate(optimizer, epoch)

and here is the revised model/binarization code:

import torch.nn as nn
import torch
import torch.nn.functional as F

class BinActive(torch.autograd.Function):
    def forward(self, input):
        size = input.size()
        mean = torch.mean(input.abs(), 1, keepdim=True)
        input = input.sign()
        return input, mean
    def backward(self, grad_output, grad_output_mean):
        input, = self.saved_tensors
        grad_input = grad_output.clone()
        grad_input[] = 0
        grad_input[] = 0
        return grad_input

binactive = BinActive.apply
def BinActive(input):
    size = input.size()
    mean = torch.mean(input.abs(), 1, keepdim=True)
    input = input.sign()
    return input, mean
class BinConv2d(nn.Module):
    def __init__(self, input_channels, output_channels,
            kernel_size=-1, stride=-1, padding=-1, dropout=0):
        super(BinConv2d, self).__init__()
        self.layer_type = 'BinConv2d'
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dropout_ratio = dropout = nn.BatchNorm2d(input_channels, eps=1e-4, momentum=0.1, affine=True) =
        if dropout!=0:
            self.dropout = nn.Dropout(dropout)
        self.conv = nn.Conv2d(input_channels, output_channels,
                kernel_size=kernel_size, stride=stride, padding=padding)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x):
        x =
        x, mean = binactive(x)
        if self.dropout_ratio!=0:
            x = self.dropout(x)
        x = self.conv(x)
        x = self.relu(x)
        return x

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.xnor = nn.Sequential(
                nn.Conv2d(3, 192, kernel_size=5, stride=1, padding=2),
                nn.BatchNorm2d(192, eps=1e-4, momentum=0.1, affine=False),
                BinConv2d(192, 160, kernel_size=1, stride=1, padding=0),
                BinConv2d(160,  96, kernel_size=1, stride=1, padding=0),
                nn.MaxPool2d(kernel_size=3, stride=2, padding=1),

                BinConv2d( 96, 192, kernel_size=5, stride=1, padding=2, dropout=0.5),
                BinConv2d(192, 192, kernel_size=1, stride=1, padding=0),
                BinConv2d(192, 192, kernel_size=1, stride=1, padding=0),
                nn.AvgPool2d(kernel_size=3, stride=2, padding=1),

                BinConv2d(192, 192, kernel_size=3, stride=1, padding=1, dropout=0.5),
                BinConv2d(192, 192, kernel_size=1, stride=1, padding=0),
                nn.BatchNorm2d(192, eps=1e-4, momentum=0.1, affine=False),
                nn.Conv2d(192,  10, kernel_size=1, stride=1, padding=0),
                nn.AvgPool2d(kernel_size=8, stride=1, padding=0),

    def forward(self, x):
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
                if hasattr(m.weight, 'data'):
        x = self.xnor(x)
        x = x.view(x.size(0), 10)
        return x

and this is the error I get:

Traceback (most recent call last):
  File "", line 205, in <module>
  File "", line 44, in train
  File "/home/mel/.local/lib/python3.7/site-packages/torch/", line 221, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/mel/.local/lib/python3.7/site-packages/torch/autograd/", line 132, in backward
    allow_unreachable=True)  # allow_unreachable flag
  File "/home/mel/.local/lib/python3.7/site-packages/torch/autograd/", line 89, in apply
    return self._forward_cls.backward(self, *args)  # type: ignore
  File "/home/mel/.local/lib/python3.7/site-packages/torch/autograd/", line 201, in backward
    raise NotImplementedError("You must implement the backward function for custom"
NotImplementedError: You must implement the backward function for custom autograd.Function.

I’ve been googling for 3 hours, but I can’t find a solution.
Is it correct to add BinActive’s backward to the loss function?

You need `@staticmethod` before `forward` and `backward`: these custom Functions are not called with `self` (a Function instance); instead a special context object (`ctx`) is passed as the first argument, and you invoke the Function via `.apply`. See the example in the PyTorch docs on extending autograd.

Thanks for your reply.

I solved this error!