How to reduce autograd memory usage?

My program’s memory usage is roughly an order of magnitude greater when I specify requires_grad=True on the parameters of my model. I’ve looked through the docs to find a way to reduce my program’s memory consumption, but I can’t seem to figure it out. Here is my objective function:

def fun(x, cons, est, trans, model, data):
    """Return the negated, scaled log-likelihood and its negated gradient.

    The result has the ``(value, gradient)`` shape of an objective that
    also supplies its own gradient.

    Parameters
    ----------
    x : numpy.ndarray
        Flat parameter vector.  Entries are consumed in order: one per
        ``'pool'`` or ``'rand'`` entry of ``est``, then ``R`` diagonal
        entries of ``sigma``, then its strictly-lower-triangular entries,
        where ``R`` is the number of ``'rand'`` entries.
    cons : sequence of dict
        Constraints with keys ``'fun'`` and ``'args'``.  ``x`` is feasible
        when ``con['fun'](x, *con['args'])`` is strictly positive everywhere
        for every constraint.  An empty sequence means "no constraints".
    est : dict
        Maps a parameter name to a fixed (non-string) value, to ``'pool'``
        (read directly from ``x``) or to ``'rand'`` (its mean is read from
        ``x`` and it is then sampled).  Other strings are ignored.
    trans : dict
        Maps each ``'rand'`` parameter name to a callable applied to that
        parameter's sampled column.
    model : object
        Passed through unchanged to ``logL``.
    data : sequence of dict
        One entry per data set; the keys ``'N'``, ``'S'``, ``'K'`` and
        ``'treat'`` are read here, the rest by ``logL``.

    Returns
    -------
    tuple
        ``(-ll / 100, -grad)`` where ``ll`` is the summed log-likelihood and
        ``grad`` the gradient of ``ll / 100`` with respect to ``x``; or
        ``(nan, array of nan)`` when a constraint is violated.
    """
    print(x)
    # all() stops at the first violated constraint (like the former
    # loop-and-break) and is True for an empty constraint list, so `valid`
    # is always defined.
    valid = all(np.all(con['fun'](x, *con['args']) > 0) for con in cons)
    if not valid:
        print('Constraint violation')
        print()
        return (float('nan'), np.full(len(x), np.nan))

    # Re-seeding on every call makes the random draws below identical
    # across evaluations of the objective.
    torch.manual_seed(7)
    ex0 = (None,)               # index that adds a leading axis
    ex1 = (slice(None), None)   # index that adds a trailing axis
    x = torch.tensor(x.astype(np.float32), requires_grad=True)
    ll = 0
    for d in range(len(data)):
        N, S, K = (data[d]['N'], data[d]['S'], data[d]['K'])
        p = 0                   # position of the next unread entry of x
        mu = torch.tensor([])
        re = []                 # names of the 'rand' parameters, in order
        theta = {}
        for (key, val) in est.items():
            if not isinstance(val, str):
                theta[key] = torch.tensor(val)
            elif val == 'pool':
                theta[key] = x[p][ex0]
                p += 1
            elif val == 'rand':
                mu = torch.cat((mu, x[p][ex0]))
                p += 1
                re.append(key)
        R = len(re)
        di = tuple(torch.tensor(ind, dtype=torch.long)
                   for ind in np.diag_indices(R))
        tril = tuple(torch.tensor(ind, dtype=torch.long)
                     for ind in np.tril_indices(R, -1))
        triu = tuple(torch.tensor(ind, dtype=torch.long)
                     for ind in np.triu_indices(R, 1))
        # Fill the diagonal and lower triangle from x, then mirror the
        # lower triangle into the upper one so sigma is symmetric.
        sigma = torch.empty(R, R)
        sigma[di] = x[p:p+R]
        sigma[tril] = x[p+R:]
        sigma[triu] = sigma.t()[triu]
        # rsample keeps the draws differentiable w.r.t. mu and sigma.
        sample = MultivariateNormal(mu, sigma).rsample((N * S,))
        theta.update({re[r]: trans[re[r]](sample[:, r]) for r in range(R)})
        # First column of 'upsilon' is zero; 'upsilon2' / 'upsilon3' must be
        # present in theta when K exceeds 1 / 2.
        theta['upsilon'] = torch.zeros(N * S)[ex1]
        if K > 1:
            theta['upsilon'] = torch.cat((theta['upsilon'],
                                          theta['upsilon2'][ex1]), 1)
        if K > 2:
            theta['upsilon'] = torch.cat((theta['upsilon'],
                                          theta['upsilon3'][ex1]), 1)
        theta['upsilon'] = theta['upsilon'] / theta['kappa']
        if data[d]['treat'] == 'pvd':
            theta['omega'] = torch.zeros(1)
        ll = ll + logL(theta, model, data[d])
    ll = ll / 1e+2
    ll.backward()
    grad = np.array(x.grad.cpu())
    print(ll.item() * 1e+2)     # report the unscaled log-likelihood
    print()
    return (-ll.item(), -grad)

Here is the logL function referenced in the objective function:

def logL(theta, model, data):
    """Return the simulated log-likelihood of one data set.

    The model is started from ``theta`` and advanced through periods
    ``1..T``.  After each step the per-row probability of the observed
    action, ``(obs * model.p_a).sum(1)``, is multiplied into a running
    product.  The product is then reshaped to ``(N, S)``, averaged over the
    second axis, logged and summed.

    ``npeat`` is defined elsewhere; it is called with ``(S, 1)`` on the
    period's slice of ``data['obs']`` (presumably to tile it ``S`` times
    along the first axis -- confirm against its definition).
    """
    n_subj, n_draw, n_period = data['N'], data['S'], data['T']
    model.start(theta, data['mat'])
    lik = torch.ones(n_subj * n_draw)
    for period in range(n_period):
        model.update(data, period + 1)
        observed = npeat(data['obs'][:, :, period], (n_draw, 1))
        lik = lik * (observed * model.p_a).sum(1)
    return lik.reshape(n_subj, n_draw).mean(1).log().sum()

And here is my model

class Model:
    """Model assembled from interchangeable component functions.

    ``opt`` maps a component key (``'p_k'``, ``'U'``, ``'B'``, ...) to the
    name of the variant to use.  ``__init__`` resolves each choice to a
    callable, ``start`` resolves it to the matching extra arguments and
    resets the running state, and ``update`` advances the state by one
    period and stores the resulting probabilities in ``self.p_a``.
    """

    def __init__(self, opt):
        """Select one implementation per component.

        Parameters
        ----------
        opt : dict
            Maps component key to variant name.  Every key/value pair must
            exist in the table below, otherwise ``KeyError`` is raised.
        """
        fun = {'p_k': {'rand': Probability.rand,
                       'hmax': Probability.hmax,
                       'smax': Probability.smax},
               'p_ak': {'rand': Probability.rand,
                        'hmax': Probability.hmax,
                        'smax': Probability.smax},
               'U' : {'none': Utility.Learning.none,
                      'diff': Utility.Learning.diff},
               'v': {'lin': Utility.lin},
               'w': {'lin': Probability.Weighting.lin,
                     'linlog': Probability.Weighting.linlog,
                     'ratio': Probability.Weighting.ratio},
               'b_1': {'rand': Probability.rand,
                      'hmax': Probability.hmax,
                      'smax': Probability.smax},
               'b_k': {'rand': Probability.rand,
                      'hmax': Probability.hmax,
                      'smax': Probability.smax},
               'B': {'none': Activation.Base.none,
                     'exp': Activation.Base.exp,
                     'pow': Activation.Base.pow},
               'S': {'none': Activation.Associative.none,
                     'chain': Activation.Associative.chain},
               'P': {'none': Activation.Matching.none,
                     'part': Activation.Matching.part},
               'R_a': {'lin': Utility.lin},
               'R_c': {'none': Activation.Base.Learning.none,
                       'belief': Activation.Base.Learning.belief,
                       'reinf': Activation.Base.Learning.reinf,
                       'attrac': Activation.Base.Learning.attrac},
               'S_q': {'none': Activation.Associative.Learning.none,
                       'diff': Activation.Associative.Learning.diff}}
        self.fun = {key: fun[key][val] for (key, val) in opt.items()}
        self.opt = opt

    def start(self, theta, mat):
        """Bind parameters and reset the running state.

        Parameters
        ----------
        theta : dict
            Parameter tensors.  Every key referenced in the table below is
            read eagerly, whichever variants ``opt`` selects, so all of them
            must be present.
        mat : torch.Tensor
            Passed to the ``'v'`` component; the size of its last axis sets
            the width that ``theta['omega']`` is expanded to.
        """
        ex1 = (slice(None), None)           # add one trailing axis
        ex12 = (slice(None), None, None)    # add two trailing axes
        omega = theta['omega'][ex1].expand(-1, mat.size()[-1])
        W = torch.cat((omega, 1 - omega), 1)
        xi = W * theta['xi'][ex1]
        beta = W * theta['beta'][ex1]
        # Extra arguments per component and variant.  Tuples are splatted
        # positionally and dicts as keywords by `update`.
        arg = {'p_k': {'rand': (1,),
                       'hmax': (1,),
                       'smax': (theta['kappa'][ex1], 1)},
               'p_ak': {'rand': (1,),
                        'hmax': (1,),
                        'smax': (theta['lam'][ex12], 1)},
               'U': {'none': (),
                     'diff': (theta['alpha'][ex1],)},
               'v': {'lin': (mat,)},
               'w': {'lin': (),
                     'linlog': (theta['gamma'][ex12], theta['delta'][ex12]),
                     'ratio': (theta['gamma'][ex12], theta['delta'][ex12], 2)},
               'b_1': {'rand': (2,),
                      'hmax': (2,),
                      'smax': (1, 2)},
               'b_k': {'rand': (2,),
                      'hmax': (2,),
                      'smax': (theta['iota'][ex12], 2)},
               # 'none' is the key __init__ uses for this variant; without
               # it, opt['B'] == 'none' raised KeyError here.  'cons' is
               # kept so nothing that referred to the old key breaks.
               'B': {'none': {},
                     'cons': {},
                     'exp': {'gamma': theta['phi'][ex1]},
                     'pow': {'d': theta['phi'][ex12], 'dim': 2}},
               'S': {'none': (),
                     'chain': (2,)},
               'P': {'none': (),
                     'part': (xi[ex1], 2)},
               'R_a': {'lin': ()},
               'R_c': {'none': {},
                       'belief': {},
                       'reinf': {},
                       'attrac': {'delta': theta['rho'][ex1]}},
               'S_q': {'none': (),
                       'diff': (theta['psi'][ex12], beta[ex1])}}
        self.arg = {key: arg[key][val] for (key, val) in self.opt.items()}
        self.theta = theta
        # Initial state; every entry below is overwritten or extended by
        # `update`.
        self.p_ak = torch.zeros(1)
        self.U = theta['upsilon']
        self.v = self.fun['v'](*self.arg['v'])
        self.B = torch.zeros(1).log()       # log(0) = -inf
        self.S = torch.zeros(1)[ex1]
        self.R_ct = torch.zeros(0)          # empty; grows along axis 2
        self.S_q = torch.zeros(1)

    def update(self, dat, t):
        """Advance the state to period ``t`` and set ``self.p_a``.

        Parameters
        ----------
        dat : dict
            Data set; the size entries ``'N','S','A','X','M','C','K','I'``
            and the tensors ``'hypo','hist','act','cue','sim','pay'`` are
            read.  The tensors are indexed on their last axis at ``t - 1``
            and/or ``t``.
        t : int
            Period, starting at 1.

        Notes
        -----
        The method both reads and overwrites ``self.U``, ``self.R_ct``,
        ``self.S_q``, ``self.B``, ``self.S`` and ``self.p_ak``, so its result
        depends on the state left behind by the previous call.
        """
        old = (Ellipsis, t - 1)
        new = (Ellipsis, t)
        ex1 = (slice(None), None)
        ex2 = (slice(None), slice(None), None)
        N, S, A, X, M, C, K, I = (dat['N'], dat['S'], dat['A'], dat['X'],
                                  dat['M'], dat['C'], dat['K'], dat['I'])
        hypo, hist, act, cue, sim, pay = (dat['hypo'], dat['hist'], dat['act'],
                                          dat['cue'], dat['sim'], dat['pay'])
        # npeat is defined elsewhere; each slice is passed with repeat
        # counts (S, 1, ..., 1) -- presumably tiling the first axis S times
        # (confirm against npeat's definition).
        hypo_old = npeat(hypo[old], (S,)+(1,)*(hypo[old].dim()-1))
        hist_old = npeat(hist[old], (S,)+(1,)*(hist[old].dim()-1))
        act_old = npeat(act[old], (S,)+(1,)*(act[old].dim()-1))
        cue_old = npeat(cue[old], (S,)+(1,)*(cue[old].dim()-1))
        cue_new = npeat(cue[new], (S,)+(1,)*(cue[new].dim()-1))
        sim_new = npeat(sim[new], (S,)+(1,)*(sim[new].dim()-1))
        pay_old = npeat(pay[old], (S,)+(1,)*(pay[old].dim()-1))
        fun, arg = (self.fun, self.arg)
        R_a = fun['R_a'](pay_old, *arg['R_a'])
        # Uses self.p_ak from the previous call (zeros(1) right after start).
        R_k = (self.p_ak * R_a[ex2]).sum(1)
        self.U = fun['U'](self.U, R_k, int(t > 1), *arg['U'])
        attn = 1 if t > 1 else self.theta['pi']
        R_c = fun['R_c'](attn, hypo=hypo_old, hist=hist_old, **arg['R_c'])
        # Append this period's R_c as a new slice along axis 2.
        self.R_ct = torch.cat((self.R_ct, R_c[ex2]), 2)
        R_ct = self.R_ct[:,:,:t]
        time = torch.arange(float(t), 0, -1)    # t, t-1, ..., 1
        self.S_q = fun['S_q'](self.S_q, self.S[ex2],
                              act_old[ex2], cue_old, *arg['S_q'])
        self.B = fun['B'](B=self.B, t=time, R=R_c, R_j=R_ct, **arg['B'])
        self.S = fun['S'](self.S_q, cue_new, *arg['S'])
        P = fun['P'](sim_new, *arg['P'])
        A_c = (self.B + self.S + P).reshape(N * S, I * M, C // M)
        b_1 = fun['b_1'](A_c, *arg['b_1']).reshape(N * S, I * M, A, X).sum(3)
        w = npeat(fun['w'](b_1, *arg['w']), (1, A // M, 1))
        V_k = (w * self.v).sum(2)[ex2]
        # Each pass appends one more slice to V_k along axis 2, built from
        # the previous slice with the two blocks of A rows swapped.
        for k in range(1, K):
            own = (slice(None), slice(0, A), k - 1)
            opp = (slice(None), slice(A, 2 * A), k - 1)
            V = torch.cat((V_k[opp], V_k[own]), 1).reshape(N * S, 2, A)
            b_k = npeat(fun['b_k'](V, *arg['b_k']), (1, A, 1))
            V_k = torch.cat((V_k, (b_k * self.v).sum(2)[ex2]), 2)
        self.p_k = fun['p_k'](self.U, *arg['p_k'])
        self.p_ak = fun['p_ak'](V_k[:,:A], *arg['p_ak'])
        # Weight p_ak by p_k and sum over axis 2.
        self.p_a = (self.p_k[ex1] * self.p_ak).sum(2)

I don’t understand why autograd is consuming so much memory. Please let me know if you see anything I can do to reduce my memory consumption. I’m willing to provide any additional information related to the program if you think it will help diagnose the problem.

Hi,

To be able to compute the backward pass, the autograd needs to save some intermediary results that are required by the backward formula.
Just given the size of your function, I guess there are a lot of these intermediary states that need to be saved, leading to a significant memory increase.

Note that this is expected and if you use a resnet for example, most of the memory usage will come from the autograd states!

You can try to use torch.utils.checkpoint (see its documentation) to trade memory for compute: it will discard some of these intermediate results and recompute them during the backward pass.
So if you wrap a collection of ops inside a checkpoint, you won’t have to save the intermediary states for these. But they will be recomputed during the backward which will be slower.

Thank you for your response! I actually tried using checkpoints before making this post, but I don’t think I’m using them properly. I tried putting logL and its arguments inside of checkpoint.checkpoint() only to get an error saying “Element 0 of tensors does not require grad and does not have a grad_fn”. Where should I be putting these checkpoints? Could you give me an example of how I might implement checkpoints in my code? If I could get one working example, I’m sure I could fiddle around with it on my own to figure out the right balance between memory and compute.

A few things to be careful about when using this are:

  • At least one input to checkpoint function must require gradients. You can make your model input requires grad for example if the parameters are “hidden” inside the model.
  • It will allow you to save the memory from the intermediary results inside your function (but not the ones that are inputs or outputs). So to get any benefit, your function must be big enough.
  • It will recompute these intermediary results during the backward and only keep them around while the backward for the checkpoint is computed. So you cannot put all your function inside a single checkpoint otherwise the max memory requirement will be the same as before.

An example is if you use resnet network that contains many blocks of conv + conv + relu + bn. Then you can checkpoint each of these blocks to reduce the memory usage without a big speed penalty.

Ok, let’s talk about a simpler example:

import torch
from torch.utils import checkpoint

def fun(theta, t):
    """Return ``theta['alpha']**t + theta['beta']**t``."""
    alpha_term = theta['alpha'] ** t
    beta_term = theta['beta'] ** t
    return alpha_term + beta_term

def wrapper(x):
    """Sum ``fun`` over ``t = 0, 1, 2`` through checkpoint and return the
    value and the gradient with respect to ``x``.

    As posted, this variant is reported to fail with "element 0 of tensors
    does not require grad and does not have a grad_fn".
    """
    x = torch.tensor(x, requires_grad=True)
    theta = {}
    theta['alpha'] = x[0]
    theta['beta'] = x[1]
    out = torch.empty(3)
    for t in range(3):
        # The only arguments handed to checkpoint are a dict and an int;
        # no tensor that requires grad is passed directly.
        out[t] = checkpoint.checkpoint(fun, *(theta, t))
    y = out.sum()
    y.backward()
    grad = x.grad
    return (y.item(), grad)

print(wrapper([2., 3.]))

From your response, it sounds like maybe I should make a checkpoint at each iteration of my for loop in logL, so here is a toy example where I am essentially trying to do the same thing in the function wrapper. When I run this code, it returns “RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn”, but theta requires grad here right? What am I doing wrong? Note that if checkpoint.checkpoint() is removed, the function returns the appropriate gradient.

This is because no input to the checkpoint function is a Tensor that requires grad: you pass a dictionary and a Python number.

import torch
from torch.utils import checkpoint

def fun(x, theta, t):
    """Return ``theta['alpha']**t + theta['beta']**t``.

    ``x`` is accepted but never read; it exists only so the caller can pass
    a tensor as the first checkpoint argument.
    """
    base_a, base_b = theta['alpha'], theta['beta']
    return base_a ** t + base_b ** t

def wrapper(x):
    """Sum ``fun`` over ``t = 0, 1, 2`` through checkpoint and return the
    value and the gradient with respect to ``x``.

    As posted, this variant is reported to fail with "save_for_backward can
    only save variables, but argument 1 is of type dict".
    """
    x = torch.tensor(x, requires_grad=True)
    theta = {}
    theta['alpha'] = x[0]
    theta['beta'] = x[1]
    out = torch.empty(3)
    for t in range(3):
        # x is now passed as a tensor argument, but the dict `theta` is
        # still handed to checkpoint as argument 1.
        out[t] = checkpoint.checkpoint(fun, *(x, theta, t))
    y = out.sum()
    y.backward()
    grad = x.grad
    return (y.item(), grad)

print(wrapper([2., 3.]))

If I modify fun to also take x, I still get a type error saying “save_for_backward can only save variables, but argument 1 is of type dict.” Does this mean that the function in a checkpoint can only take tensors for which requires_grad=True? If so, that seems extremely limiting. I don’t even know how this simple example can be rewritten to do that.

You can actually capture what you want implicitly.
Unfortunately the checkpoint was built to work with much simpler function, usually the forward pass from a nn.Module.

You can go around it by doing something like this here:

import torch
from torch.utils import checkpoint

def fun(theta, t):
    """Return the sum of ``theta['alpha']`` and ``theta['beta']``, each
    raised to the power ``t``."""
    return pow(theta['alpha'], t) + pow(theta['beta'], t)

def wrapper(x):
    """Sum ``fun`` over ``t = 0, 1, 2`` through checkpoint and return the
    value and the gradient with respect to ``x``.

    As posted, this variant runs but is reported to return the gradient
    [12., 18.] instead of the expected [5., 7.].
    """
    x = torch.tensor(x, requires_grad=True)
    theta = {}
    theta['alpha'] = x[0]
    theta['beta'] = x[1]
    out = torch.empty(3)
    for t in range(3):
        # The lambda ignores its argument and reads `theta` and `t` from the
        # enclosing scope.  `t` is looked up when the lambda is called, so a
        # call made after the loop has finished sees its final value.
        out[t] = checkpoint.checkpoint(lambda x: fun(theta, t), x)
    y = out.sum()
    y.backward()
    grad = x.grad
    return (y.item(), grad)

print(wrapper([2., 3.]))

Oh, that’s a neat trick. For some reason the gradient calculation is wrong though. The gradient should be [5., 7.] in this example, but this returns [12., 18.].

Edit: I think I was able to determine the source of the problem. For the entirety of the backward pass, the index t is taken to be its last value in the for loop (i.e., 2 in this example). This results in the miscalculated gradient. Is there a way to rewrite this example so the correct gradient is calculated?

Oh, lambda and for-loop strike back! My bad!
tl;dr: the lambda is saved with references to the Python variables, so when it is called during the backward pass (at the y.backward() line), t has the wrong value (its last value from the loop).
Using this as the function should work:

# Inside the loop
def fn(x, t=t):
    # `t=t` freezes the current loop value as a default argument, so a later
    # call (e.g. the recomputation during backward) still sees this
    # iteration's t.  `x` is unused; it only gives checkpoint a tensor input.
    # The result must be returned, otherwise checkpoint receives None.
    return fun(theta, t)

The default argument is saved at the function creation time, so each iteration will have its own version of the default argument based on the current value of t.

Yes! That works perfectly. Thank you so much for your help. I’ll try implementing this on my actual program and see if it resolves the memory problem. If it does, I suppose I’ll mark your first reply as the solution.

Now I’m getting the following error:

RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

I only call .backward() once, and setting retain_graph=True just returns the same error. Here is my modified code:


def logL(x, theta, model, data):
    """Return the simulated log-likelihood of one data set, with each
    period's model update wrapped in a checkpoint.

    ``x`` is not read by the computation; it is passed only as the tensor
    argument of ``checkpoint.checkpoint``.  The closure reaches ``model``
    through the enclosing scope and gets ``data`` and ``t`` as default
    arguments.

    NOTE(review): the closure mutates the shared ``model``; the surrounding
    discussion reports "Trying to backward through the graph a second time"
    for this version -- confirm before relying on it.
    """
    n_subj, n_draw, n_period = data['N'], data['S'], data['T']
    lik = torch.ones(n_subj * n_draw)
    model.start(theta, data['mat'])
    for t in range(1, n_period + 1):
        # Defaults are evaluated now, so each closure keeps its own t.
        def step(x, data=data, t=t):
            model.update(data, t)
            return model.p_a
        step_prob = checkpoint.checkpoint(step, x)
        observed = npeat(data['obs'][:, :, t - 1], (n_draw, 1))
        lik = lik * (observed * step_prob).sum(1)
    return lik.reshape(n_subj, n_draw).mean(1).log().sum()

def fun(x, cons, est, trans, model, data):
    """Return the negated, scaled log-likelihood and its negated gradient.

    The result has the ``(value, gradient)`` shape of an objective that
    also supplies its own gradient.  This version forwards the parameter
    tensor ``x`` to ``logL`` in addition to ``theta``.

    Parameters
    ----------
    x : numpy.ndarray
        Flat parameter vector.  Entries are consumed in order: one per
        ``'pool'`` or ``'rand'`` entry of ``est``, then ``R`` diagonal
        entries of ``sigma``, then its strictly-lower-triangular entries,
        where ``R`` is the number of ``'rand'`` entries.
    cons : sequence of dict
        Constraints with keys ``'fun'`` and ``'args'``.  ``x`` is feasible
        when ``con['fun'](x, *con['args'])`` is strictly positive everywhere
        for every constraint.  An empty sequence means "no constraints".
    est : dict
        Maps a parameter name to a fixed (non-string) value, to ``'pool'``
        (read directly from ``x``) or to ``'rand'`` (its mean is read from
        ``x`` and it is then sampled).  Other strings are ignored.
    trans : dict
        Maps each ``'rand'`` parameter name to a callable applied to that
        parameter's sampled column.
    model : object
        Passed through unchanged to ``logL``.
    data : sequence of dict
        One entry per data set; the keys ``'N'``, ``'S'``, ``'K'`` and
        ``'treat'`` are read here, the rest by ``logL``.

    Returns
    -------
    tuple
        ``(-ll / 100, -grad)`` where ``ll`` is the summed log-likelihood and
        ``grad`` the gradient of ``ll / 100`` with respect to ``x``; or
        ``(nan, array of nan)`` when a constraint is violated.
    """
    print(x)
    # all() stops at the first violated constraint (like the former
    # loop-and-break) and is True for an empty constraint list, so `valid`
    # is always defined.
    valid = all(np.all(con['fun'](x, *con['args']) > 0) for con in cons)
    if not valid:
        print('Constraint violation')
        print()
        return (float('nan'), np.full(len(x), np.nan))

    # Re-seeding on every call makes the random draws below identical
    # across evaluations of the objective.
    torch.manual_seed(7)
    ex0 = (None,)               # index that adds a leading axis
    ex1 = (slice(None), None)   # index that adds a trailing axis
    x = torch.tensor(x.astype(np.float32), requires_grad=True)
    ll = 0
    for d in range(len(data)):
        N, S, K = (data[d]['N'], data[d]['S'], data[d]['K'])
        p = 0                   # position of the next unread entry of x
        mu = torch.tensor([])
        re = []                 # names of the 'rand' parameters, in order
        theta = {}
        for (key, val) in est.items():
            if not isinstance(val, str):
                theta[key] = torch.tensor(val)
            elif val == 'pool':
                theta[key] = x[p][ex0]
                p += 1
            elif val == 'rand':
                mu = torch.cat((mu, x[p][ex0]))
                p += 1
                re.append(key)
        R = len(re)
        di = tuple(torch.tensor(ind, dtype=torch.long)
                   for ind in np.diag_indices(R))
        tril = tuple(torch.tensor(ind, dtype=torch.long)
                     for ind in np.tril_indices(R, -1))
        triu = tuple(torch.tensor(ind, dtype=torch.long)
                     for ind in np.triu_indices(R, 1))
        # Fill the diagonal and lower triangle from x, then mirror the
        # lower triangle into the upper one so sigma is symmetric.
        sigma = torch.empty(R, R)
        sigma[di] = x[p:p+R]
        sigma[tril] = x[p+R:]
        sigma[triu] = sigma.t()[triu]
        # rsample keeps the draws differentiable w.r.t. mu and sigma.
        sample = MultivariateNormal(mu, sigma).rsample((N * S,))
        theta.update({re[r]: trans[re[r]](sample[:, r]) for r in range(R)})
        # First column of 'upsilon' is zero; 'upsilon2' / 'upsilon3' must be
        # present in theta when K exceeds 1 / 2.
        theta['upsilon'] = torch.zeros(N * S)[ex1]
        if K > 1:
            theta['upsilon'] = torch.cat((theta['upsilon'],
                                          theta['upsilon2'][ex1]), 1)
        if K > 2:
            theta['upsilon'] = torch.cat((theta['upsilon'],
                                          theta['upsilon3'][ex1]), 1)
        theta['upsilon'] = theta['upsilon'] / theta['kappa']
        if data[d]['treat'] == 'pvd':
            theta['omega'] = torch.zeros(1)
        ll = ll + logL(x, theta, model, data[d])
    ll = ll / 1e+2
    ll.backward()
    grad = np.array(x.grad.cpu())
    print(ll.item() * 1e+2)     # report the unscaled log-likelihood
    print()
    return (-ll.item(), -grad)

Note that fun has only been changed to pass the parameter vector x to logL, and logL has been changed to reflect the changes we made in the toy example.

Oh right: because you “hide” these inputs from the checkpoint code, it cannot properly detach them during the backward pass…

This is code that I haven’t run so there might be typos in it :smiley:

def detach(obj):
    """Return ``obj`` with every tensor reachable through dicts and tuples
    replaced by a detached copy that keeps its ``requires_grad`` flag.

    Dicts and tuples are rebuilt recursively; anything else (including
    lists) is returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: detach(val) for key, val in obj.items()}
    if isinstance(obj, tuple):
        return tuple(detach(item) for item in obj)
    if not torch.is_tensor(obj):
        return obj
    # detach() drops the autograd history; restore the flag so the result
    # is a leaf that can still receive gradients.
    leaf = obj.detach()
    leaf.requires_grad_(obj.requires_grad)
    return leaf


    def fn(x, data=data, t=t):
        # Hand the model detached copies of the captured inputs; `x` itself
        # is unused and only serves as checkpoint's tensor argument.
        model.update(detach(data), detach(t))
        return model.p_a

That will solve this issue.
This checkpoint wasn’t really made for anything but neural nets. Sorry it is so painful :confused:

1 Like

Your new code runs fine but it’s still returning

RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

So if I understand you correctly, the problem is that checkpoint isn’t detaching all of the tensors for which requires_grad=True. Could it be that the tensor attributes of model need to be detached as well?

If they are just Parameters no need.
But if you perform some ops on Tensors before using them, then yes you will need to detach them before using them here.

Ok, I believe I detached the attributes of model with the following code:

def detach(obj):
    """Return ``obj`` with every tensor reachable through dicts, tuples and
    ``Model`` attributes replaced by a detached copy that keeps its
    ``requires_grad`` flag.

    Dicts and tuples are rebuilt; a ``Model`` is modified in place (each of
    its attributes is reassigned) and the same object is returned.  Any
    other value is returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: detach(val) for key, val in obj.items()}
    if isinstance(obj, tuple):
        return tuple(detach(item) for item in obj)
    if torch.is_tensor(obj):
        # detach() drops the autograd history; restore the flag so the
        # result is a leaf that can still receive gradients.
        leaf = obj.detach()
        leaf.requires_grad_(obj.requires_grad)
        return leaf
    if isinstance(obj, Model):
        # Only existing attributes are reassigned, so the attribute dict
        # does not change size while it is being iterated.
        for name, value in vars(obj).items():
            setattr(obj, name, detach(value))
    return obj
def logL(x, theta, model, data):
    """Return the simulated log-likelihood of one data set, checkpointing
    each period's model update and detaching the model state first.

    ``x`` is not read by the computation; it is passed only as the tensor
    argument of ``checkpoint.checkpoint``.

    NOTE(review): ``model`` is one shared object.  Each call of ``fn``
    reassigns its attributes (via ``detach``) and then advances it with
    ``update``.  If checkpoint calls ``fn`` again during backward, it would
    start from whatever state the model holds at that point rather than the
    state it had in the forward pass -- possibly the cause of the reported
    size mismatch (28000 vs 56000); confirm.
    NOTE(review): nothing inside ``fn`` is computed from its ``x``
    argument, so it is unclear how gradients reach ``x`` through the
    checkpoint -- verify the returned gradient against the
    non-checkpointed version.
    """
    N, S, T = (data['N'], data['S'], data['T'])
    p_a = torch.ones(N * S)
    model.start(theta, data['mat'])
    for t in range(1, T + 1):
        # model, data and t are bound as defaults so each closure keeps the
        # values of its own iteration.
        def fn(x, model=model, data=data, t=t):
            data = detach(data)
            t = detach(t)
            # For a Model, detach reassigns the attributes in place and
            # returns the same object.
            model = detach(model)
            model.update(data, t)
            return model.p_a
        out = checkpoint.checkpoint(fn, x)
        # Multiply in this period's per-row probability of the observed
        # action.
        p_a = p_a * (npeat(data['obs'][:,:,t-1], (S, 1)) * out).sum(1)
    # Reshape to (N, S), average over the second axis, then sum the logs.
    ll = p_a.reshape(N, S).mean(1).log().sum()
    return ll

This gets me past the previous runtime error, but now I’m getting a new one:

File “C:\Users\Peter\Anaconda3\lib\site-packages\scipy\optimize\optimize.py”, line 65, in call
fg = self.fun(x, *args)

File “C:\Users\Peter\Desktop\JMP\Analysis\Original estimation 3-22-19\myLib_new2.py”, line 115, in fun
ll.backward()

File “C:\Users\Peter\AppData\Roaming\Python\Python37\site-packages\torch\tensor.py”, line 198, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)

File “C:\Users\Peter\AppData\Roaming\Python\Python37\site-packages\torch\autograd_init_.py”, line 100, in backward
allow_unreachable=True) # allow_unreachable flag

RuntimeError: The size of tensor a (28000) must match the size of tensor b (56000) at non-singleton dimension 0

Still trying to figure this one out. I know it’s related to the checkpoint because the model ran fine without it before.

Hmmm you will need to share a stack trace and the line for this one.
This is a quite generic error message so it can come from many places.

Here is the complete stack trace:

Traceback (most recent call last):

File “”, line 1, in
runfile(‘C:/Users/Peter/Desktop/JMP/Analysis/Original estimation 3-22-19/estimations_new2.py’, wdir=‘C:/Users/Peter/Desktop/JMP/Analysis/Original estimation 3-22-19’)

File “C:\Users\Peter\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py”, line 827, in runfile
execfile(filename, namespace)

File “C:\Users\Peter\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py”, line 110, in execfile
exec(compile(f.read(), filename, ‘exec’), namespace)

File “C:/Users/Peter/Desktop/JMP/Analysis/Original estimation 3-22-19/estimations_new2.py”, line 198, in
take_step=take_step, minimizer_kwargs=arg)

File “C:\Users\Peter\Anaconda3\lib\site-packages\scipy\optimize_basinhopping.py”, line 669, in basinhopping
accept_tests, disp=disp)

File “C:\Users\Peter\Anaconda3\lib\site-packages\scipy\optimize_basinhopping.py”, line 74, in init
minres = minimizer(self.x)

File “C:\Users\Peter\Anaconda3\lib\site-packages\scipy\optimize_basinhopping.py”, line 286, in call
return self.minimizer(self.func, x0, **self.kwargs)

File “C:\Users\Peter\Anaconda3\lib\site-packages\scipy\optimize_minimize.py”, line 618, in minimize
constraints, callback=callback, **options)

File “C:\Users\Peter\Anaconda3\lib\site-packages\scipy\optimize\slsqp.py”, line 399, in _minimize_slsqp
fx = func(x)

File “C:\Users\Peter\Anaconda3\lib\site-packages\scipy\optimize\optimize.py”, line 327, in function_wrapper
return function(*(wrapper_args + args))

File “C:\Users\Peter\Anaconda3\lib\site-packages\scipy\optimize\optimize.py”, line 65, in call
fg = self.fun(x, *args)

File “C:\Users\Peter\Desktop\JMP\Analysis\Original estimation 3-22-19\myLib_new2.py”, line 115, in fun
ll.backward()

File “C:\Users\Peter\AppData\Roaming\Python\Python37\site-packages\torch\tensor.py”, line 198, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)

File “C:\Users\Peter\AppData\Roaming\Python\Python37\site-packages\torch\autograd_init_.py”, line 100, in backward
allow_unreachable=True) # allow_unreachable flag

RuntimeError: The size of tensor a (28000) must match the size of tensor b (56000) at non-singleton dimension 0

Oh, it happens during the backward pass. You can call torch.autograd.set_detect_anomaly(True) (see its documentation) at the beginning of your script to get a second stack trace pointing to the forward op that generated this error during the backward pass.

I’ve put torch.autograd.set_detect_anomaly(True) at the start of my script but I’m not getting anything new in the stack. I’m not sure what I’m doing wrong.

Edit: I thought maybe I wasn’t getting that traceback you were talking about because I was running it in spyder, but here’s the traceback in a jupyter notebook:

> RuntimeError                              Traceback (most recent call last)
> <ipython-input-1-94a4c9e38816> in <module>
>     198 start = time.time()
>     199 res = optimize.basinhopping(my.fun, x0, niter=niter, T=temp, stepsize=stepsize,
> --> 200                             take_step=take_step, minimizer_kwargs=arg)
>     201 end = time.time()
>     202 sec = end - start
> 
> ~\Anaconda3\lib\site-packages\scipy\optimize\_basinhopping.py in basinhopping(func, x0, niter, T, stepsize, minimizer_kwargs, take_step, accept_test, callback, interval, disp, niter_success, seed)
>     667 
>     668     bh = BasinHoppingRunner(x0, wrapped_minimizer, take_step_wrapped,
> --> 669                             accept_tests, disp=disp)
>     670 
>     671     # start main iteration loop
> 
> ~\Anaconda3\lib\site-packages\scipy\optimize\_basinhopping.py in __init__(self, x0, minimizer, step_taking, accept_tests, disp)
>      72 
>      73         # do initial minimization
> ---> 74         minres = minimizer(self.x)
>      75         if not minres.success:
>      76             self.res.minimization_failures += 1
> 
> ~\Anaconda3\lib\site-packages\scipy\optimize\_basinhopping.py in __call__(self, x0)
>     284             return self.minimizer(x0, **self.kwargs)
>     285         else:
> --> 286             return self.minimizer(self.func, x0, **self.kwargs)
>     287 
>     288 
> 
> ~\Anaconda3\lib\site-packages\scipy\optimize\_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
>     616     elif meth == 'slsqp':
>     617         return _minimize_slsqp(fun, x0, args, jac, bounds,
> --> 618                                constraints, callback=callback, **options)
>     619     elif meth == 'trust-constr':
>     620         return _minimize_trustregion_constr(fun, x0, args, jac, hess, hessp,
> 
> ~\Anaconda3\lib\site-packages\scipy\optimize\slsqp.py in _minimize_slsqp(func, x0, args, jac, bounds, constraints, maxiter, ftol, iprint, disp, eps, callback, **unknown_options)
>     397 
>     398             # Compute objective function
> --> 399             fx = func(x)
>     400             try:
>     401                 fx = float(np.asarray(fx))
> 
> ~\Anaconda3\lib\site-packages\scipy\optimize\optimize.py in function_wrapper(*wrapper_args)
>     325     def function_wrapper(*wrapper_args):
>     326         ncalls[0] += 1
> --> 327         return function(*(wrapper_args + args))
>     328 
>     329     return ncalls, function_wrapper
> 
> ~\Anaconda3\lib\site-packages\scipy\optimize\optimize.py in __call__(self, x, *args)
>      63     def __call__(self, x, *args):
>      64         self.x = numpy.asarray(x).copy()
> ---> 65         fg = self.fun(x, *args)
>      66         self.jac = fg[1]
>      67         return fg[0]
> 
> ~\Desktop\JMP\Analysis\Original estimation 3-22-19\myLib_new2.py in fun(x, cons, est, trans, model, data)
>     115             ll = ll + logL(x, theta, model, data[d])
>     116         ll = ll / 1e+2
> --> 117         ll.backward()
>     118         grad = np.array(x.grad.cpu())
>     119         print(ll.item() * 1e+2)
> 
> ~\AppData\Roaming\Python\Python37\site-packages\torch\tensor.py in backward(self, gradient, retain_graph, create_graph)
>     196                 products. Defaults to ``False``.
>     197         """
> --> 198         torch.autograd.backward(self, gradient, retain_graph, create_graph)
>     199 
>     200     def register_hook(self, hook):
> 
> ~\AppData\Roaming\Python\Python37\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
>      98     Variable._execution_engine.run_backward(
>      99         tensors, grad_tensors, retain_graph, create_graph,
> --> 100         allow_unreachable=True)  # allow_unreachable flag
>     101 
>     102 
> 
> RuntimeError: The size of tensor a (28000) must match the size of tensor b (56000) at non-singleton dimension 0