Runtime error: Legacy autograd function with non-static forward method is deprecated. Please use new-style autograd function with static forward method

In this Python code:


import numpy as np
import scipy.stats as st
import operator
from functools import reduce

import torch
import torch.nn as nn
from torch.autograd import Variable, Function
from torch.nn.parameter import Parameter
import torch.optim as optim
import torch.cuda

import qpth
from qpth.qp import QPFunction

import ipdb

class Net(nn.Module):
    def __init__(self, X, Y, hidden_layer_sizes):
        super(Net, self).__init__()

        # Initialize linear layer with least squares solution
        X_ = np.hstack([X, np.ones((X.shape[0],1))])
        Theta = np.linalg.solve(X_.T.dot(X_), X_.T.dot(Y))
        
        self.lin = nn.Linear(X.shape[1], Y.shape[1])
        W,b = self.lin.parameters()
        W.data = torch.Tensor(Theta[:-1,:].T)
        b.data = torch.Tensor(Theta[-1,:])
        
        # Set up non-linear network of 
        # Linear -> BatchNorm -> ReLU -> Dropout layers
        layer_sizes = [X.shape[1]] + hidden_layer_sizes
        layers = reduce(operator.add, 
            [[nn.Linear(a,b), nn.BatchNorm1d(b), nn.ReLU(), nn.Dropout(p=0.2)] 
                for a,b in zip(layer_sizes[0:-1], layer_sizes[1:])])
        layers += [nn.Linear(layer_sizes[-1], Y.shape[1])]
        self.net = nn.Sequential(*layers)
        self.sig = Parameter(torch.ones(1, Y.shape[1]).cuda())
        
    def forward(self, x):
        return self.lin(x) + self.net(x), \
            self.sig.expand(x.size(0), self.sig.size(1))
    
    def set_sig(self, X, Y):
        Y_pred = self.lin(X) + self.net(X)
        var = torch.mean((Y_pred-Y)**2, 0)
        self.sig.data = torch.sqrt(var).cuda().data.unsqueeze(0)


class GLinearApprox(Function):
    """ Linear (gradient) approximation of G function at z"""
    def __init__(self, gamma_under, gamma_over):
        self.gamma_under = gamma_under
        self.gamma_over = gamma_over
   

    def forward(self, z, mu, sig):
        self.save_for_backward(z, mu, sig)
        p = st.norm(mu.cpu().numpy(),sig.cpu().numpy())
        return torch.DoubleTensor((self.gamma_under + self.gamma_over) * p.cdf(
            z.cpu().numpy()) - self.gamma_under).cuda()
    
    @staticmethod
    def backward(self, grad_output):
        z, mu, sig = self.saved_tensors
        p = st.norm(mu.cpu().numpy(),sig.cpu().numpy())
        pz = torch.DoubleTensor(p.pdf(z.cpu().numpy())).cuda()
        
        dz = (self.gamma_under + self.gamma_over) * pz
        dmu = -dz
        dsig = -(self.gamma_under + self.gamma_over)*(z-mu) / sig * pz
        return grad_output * dz, grad_output * dmu, grad_output * dsig


class GQuadraticApprox(Function):
    """ Quadratic (gradient) approximation of G function at z"""
    def __init__(self, gamma_under, gamma_over):
        self.gamma_under = gamma_under
        self.gamma_over = gamma_over
    
    def forward(self, z, mu, sig):
        self.save_for_backward(z, mu, sig)
        p = st.norm(mu.cpu().numpy(),sig.cpu().numpy())
        return torch.DoubleTensor((self.gamma_under + self.gamma_over) * p.pdf(
            z.cpu().numpy())).cuda()
    
    def backward(self, grad_output):
        z, mu, sig = self.saved_tensors
        p = st.norm(mu.cpu().numpy(),sig.cpu().numpy())
        pz = torch.DoubleTensor(p.pdf(z.cpu().numpy())).cuda()
        
        dz = -(self.gamma_under + self.gamma_over) * (z-mu) / (sig**2) * pz
        dmu = -dz
        dsig = (self.gamma_under + self.gamma_over) * ((z-mu)**2 - sig**2) / \
            (sig**3) * pz
        
        return grad_output * dz, grad_output * dmu, grad_output * dsig


class SolveSchedulingQP(nn.Module):
    """ Solve a single SQP iteration of the scheduling problem"""
    def __init__(self, params):
        super(SolveSchedulingQP, self).__init__()
        self.c_ramp = params["c_ramp"]
        self.n = params["n"]
        D = np.eye(self.n - 1, self.n) - np.eye(self.n - 1, self.n, 1)
        self.G = Variable(torch.DoubleTensor(np.vstack([D,-D])).cuda())
        self.h = Variable((self.c_ramp * torch.ones((self.n - 1) * 2))\
            .double().cuda())
        self.e = Variable(torch.Tensor().double().cuda())
        
    def forward(self, z0, mu, dg, d2g):
        nBatch, n = z0.size()
        
        Q = torch.cat([torch.diag(d2g[i] + 1).unsqueeze(0) 
            for i in range(nBatch)], 0).double()
        p = (dg - d2g*z0 - mu).double()
        G = self.G.unsqueeze(0).expand(nBatch, self.G.size(0), self.G.size(1))
        h = self.h.unsqueeze(0).expand(nBatch, self.h.size(0))
        
        out = QPFunction(verbose=False)(Q, p, G, h, self.e, self.e)
        return out


class SolveScheduling(nn.Module):
    """ Solve the entire scheduling problem, using sequential quadratic 
        programming. """
    def __init__(self, params):
        super(SolveScheduling, self).__init__()
        self.params = params
        self.c_ramp = params["c_ramp"]
        self.n = params["n"]
        
        D = np.eye(self.n - 1, self.n) - np.eye(self.n - 1, self.n, 1)
        self.G = Variable(torch.DoubleTensor(np.vstack([D, -D])).cuda())
        self.h = Variable((self.c_ramp * torch.ones((self.n - 1) * 2))\
            .double().cuda())
        self.e = Variable(torch.Tensor().double().cuda())
        
    def forward(self, mu, sig):
        nBatch, n = mu.size()
        
        # Find the solution via sequential quadratic programming, 
        # not preserving gradients
        z0 = Variable(1. * mu.data, requires_grad=False)
        mu0 = Variable(1. * mu.data, requires_grad=False)
        sig0 = Variable(1. * sig.data, requires_grad=False)
        for i in range(20):
            dg = GLinearApprox(self.params["gamma_under"],
                 self.params["gamma_over"])(z0, mu0, sig0)
            d2g = GQuadraticApprox(self.params["gamma_under"], 
                  self.params["gamma_over"])(z0, mu0, sig0)
            z0_new = SolveSchedulingQP(self.params)(z0, mu0, dg, d2g)
            solution_diff = (z0-z0_new).norm().data[0]
            print("+ SQP Iter: {}, Solution diff = {}".format(i, solution_diff))
            z0 = z0_new
            if solution_diff < 1e-10:
                break
                  
        # Now that we found the solution, compute the gradient-propagating 
        # version at the solution
        dg = GLinearApprox(self.params["gamma_under"], 
            self.params["gamma_over"])(z0, mu, sig)
        d2g = GQuadraticApprox(self.params["gamma_under"], 
            self.params["gamma_over"])(z0, mu, sig)
        return SolveSchedulingQP(self.params)(z0, mu, dg, d2g)
I am getting this error:
/content/e2e-model-learning/power_sched/model_classes.py in forward(self=SolveScheduling(), mu=tensor([[1.6614, 1.5842, 1.5505,  ..., 2.1037, 2...0', dtype=torch.float64, grad_fn=<CopyBackwards>), sig=tensor([[0.0194, 0.0241, 0.0263,  ..., 0.0823, 0...0', dtype=torch.float64, grad_fn=<CopyBackwards>))
    151         for i in range(20):
    152             dg = GLinearApprox(self.params["gamma_under"],
--> 153                  self.params["gamma_over"])(z0, mu0, sig0)
        self.params = {'n': 24, 'c_ramp': 0.4, 'gamma_under': 50, 'gamma_over': 0.5}
        z0 = tensor([[1.6614, 1.5842, 1.5505,  ..., 2.1037, 2.0368, 1.9119],
        [1.7478, 1.6636, 1.6318,  ..., 2.0573, 1.9935, 1.8867],
        [1.7980, 1.7113, 1.6673,  ..., 1.9362, 1.8971, 1.7941],
        ...,
        [1.4203, 1.3474, 1.3094,  ..., 1.6379, 1.5986, 1.5301],
        [1.3920, 1.3247, 1.2893,  ..., 1.6337, 1.5924, 1.5241],
        [1.5190, 1.4434, 1.4108,  ..., 1.8642, 1.8024, 1.6948]],
       device='cuda:0', dtype=torch.float64)
        mu0 = tensor([[1.6614, 1.5842, 1.5505,  ..., 2.1037, 2.0368, 1.9119],
        [1.7478, 1.6636, 1.6318,  ..., 2.0573, 1.9935, 1.8867],
        [1.7980, 1.7113, 1.6673,  ..., 1.9362, 1.8971, 1.7941],
        ...,
        [1.4203, 1.3474, 1.3094,  ..., 1.6379, 1.5986, 1.5301],
        [1.3920, 1.3247, 1.2893,  ..., 1.6337, 1.5924, 1.5241],
        [1.5190, 1.4434, 1.4108,  ..., 1.8642, 1.8024, 1.6948]],
       device='cuda:0', dtype=torch.float64)
        sig0 = tensor([[0.0194, 0.0241, 0.0263,  ..., 0.0823, 0.0799, 0.0746],
        [0.0194, 0.0241, 0.0263,  ..., 0.0823, 0.0799, 0.0746],
        [0.0194, 0.0241, 0.0263,  ..., 0.0823, 0.0799, 0.0746],
        ...,
        [0.0194, 0.0241, 0.0263,  ..., 0.0823, 0.0799, 0.0746],
        [0.0194, 0.0241, 0.0263,  ..., 0.0823, 0.0799, 0.0746],
        [0.0194, 0.0241, 0.0263,  ..., 0.0823, 0.0799, 0.0746]],
       device='cuda:0', dtype=torch.float64)
    154             d2g = GQuadraticApprox(self.params["gamma_under"], 
    155                   self.params["gamma_over"])(z0, mu0, sig0)

/usr/local/lib/python3.6/dist-packages/torch/autograd/function.py in __call__(self=<model_classes.GLinearApprox object>, *args=(tensor([[1.6614, 1.5842, 1.5505,  ..., 2.1037, 2...8]],
       device='cuda:0', dtype=torch.float64), tensor([[1.6614, 1.5842, 1.5505,  ..., 2.1037, 2...8]],
       device='cuda:0', dtype=torch.float64), tensor([[0.0194, 0.0241, 0.0263,  ..., 0.0823, 0...6]],
       device='cuda:0', dtype=torch.float64)), **kwargs={})
    147     def __call__(self, *args, **kwargs):
    148         raise RuntimeError(
--> 149             "Legacy autograd function with non-static forward method is deprecated. "
    150             "Please use new-style autograd function with static forward method. "
    151             "(Example: https://pytorch.org/docs/stable/autograd.html#torch.autograd.Function)")

RuntimeError: Legacy autograd function with non-static forward method is deprecated. Please use new-style autograd function with static forward method. (Example: https://pytorch.org/docs/stable/autograd.html#torch.autograd.Function)

For newer versions of PyTorch, any torch.autograd.Function needs to be implemented with static forward and backward methods and invoked through its .apply() method. Here is a pretty comprehensive example; n.b. the extra custom_fwd/custom_bwd decorators are for AMP and aren't required if you aren't using it.

class CorrelationFunction(torch.autograd.Function):
    """
    Typical Parameters: pad_size=3, kernel_size=3, max_displacement=20,
    stride1=1, stride2=2, corr_multiply=1
    """

    @staticmethod
    @torch.cuda.amp.custom_fwd
    def forward(ctx, input1, input2, pad_size=3, kernel_size=3,
                max_displacement=20, stride1=1, stride2=2, corr_multiply=1):
        ctx.save_for_backward(input1, input2)

        ctx.pad_size = pad_size
        ctx.kernel_size = kernel_size
        ctx.max_displacement = max_displacement
        ctx.stride1 = stride1
        ctx.stride2 = stride2
        ctx.corr_multiply = corr_multiply

        output = torch.ops.cerberus.correlation(
            input1, input2, pad_size, kernel_size,
            max_displacement, stride1, stride2, corr_multiply)

        return output

    @staticmethod
    @torch.cuda.amp.custom_bwd
    def backward(ctx, grad_outputs):
        input1, input2 = ctx.saved_tensors

        grad_input1, grad_input2 = torch.ops.cerberus.correlation_backward(
            input1, input2, grad_outputs, ctx.pad_size, ctx.kernel_size,
            ctx.max_displacement, ctx.stride1, ctx.stride2, ctx.corr_multiply)

        return grad_input1, grad_input2, None, None, None, None, None, None

class Correlation(torch.nn.Module):
    def __init__(self, pad_size=0, kernel_size=0, max_displacement=0,
                 stride1=1, stride2=2, corr_multiply=1):
        super(Correlation, self).__init__()

        self.pad_size = pad_size
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride1 = stride1
        self.stride2 = stride2
        self.corr_multiply = corr_multiply

    def forward(self, input1, input2):
        return CorrelationFunction.apply(
            input1, input2, self.pad_size, self.kernel_size,
            self.max_displacement, self.stride1, self.stride2, self.corr_multiply)

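Applied to your code, here is a minimal, untested sketch of how GLinearApprox could be rewritten in the new style (GQuadraticApprox follows the same pattern). The constructor arguments gamma_under and gamma_over become extra inputs to forward and are stashed on ctx, and since they are plain Python scalars their gradients in backward are None:

import torch
import scipy.stats as st
from torch.autograd import Function

class GLinearApprox(Function):
    """ Linear (gradient) approximation of G function at z (new-style)"""

    @staticmethod
    def forward(ctx, z, mu, sig, gamma_under, gamma_over):
        # Save tensors for backward and keep the scalar weights on ctx
        ctx.save_for_backward(z, mu, sig)
        ctx.gamma_under = gamma_under
        ctx.gamma_over = gamma_over
        p = st.norm(mu.cpu().numpy(), sig.cpu().numpy())
        return torch.DoubleTensor(
            (gamma_under + gamma_over) * p.cdf(z.cpu().numpy()) - gamma_under
        ).cuda()

    @staticmethod
    def backward(ctx, grad_output):
        z, mu, sig = ctx.saved_tensors
        p = st.norm(mu.cpu().numpy(), sig.cpu().numpy())
        pz = torch.DoubleTensor(p.pdf(z.cpu().numpy())).cuda()

        dz = (ctx.gamma_under + ctx.gamma_over) * pz
        dmu = -dz
        dsig = -(ctx.gamma_under + ctx.gamma_over) * (z - mu) / sig * pz
        # One return value per forward input; the two gammas are
        # non-tensor arguments, so their gradients are None
        return grad_output * dz, grad_output * dmu, grad_output * dsig, None, None

The call sites in SolveScheduling.forward would then change from

    dg = GLinearApprox(self.params["gamma_under"],
         self.params["gamma_over"])(z0, mu0, sig0)

to something like

    dg = GLinearApprox.apply(z0, mu0, sig0,
         self.params["gamma_under"], self.params["gamma_over"])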