Running out of CUDA/GPU memory with the LBFGS optimizer

I am running my own custom deep belief network code using PyTorch and using the LBFGS optimizer. After optimization starts, my GPU starts to run out of memory, fully running out after a couple of batches, but I’m not sure why. Should I be purging memory after each batch is run through the optimizer? My code is as follows (with the portion of code that causes the problem marked):

def fine_tuning(self, data, labels, num_epochs=10, max_iter=3):
        data : TYPE torch.Tensor
            N x D tensor with N = num samples, D = num dimensions
        labels : TYPE torch.Tensor
            N x 1 vector of labels for each sample
        num_epochs : TYPE, optional
            DESCRIPTION. The default is 10.
        max_iter : TYPE, optional
            DESCRIPTION. The default is 3.


        N = data.shape[0]
        #need to unroll the weights into a typical autoencoder structure
        #encode - code - decode
        for ii in range(len(self.rbm_layers)-1, -1, -1):
        L = len(self.rbm_layers)
        optimizer = torch.optim.LBFGS(params=list(itertools.chain(*[list(self.rbm_layers[ii].parameters()) 
                                                                    for ii in range(L)]
        dataset     =, labels)
        dataloader  =, batch_size=self.batch_size*10, shuffle=True)
        #fine tune weights for num_epochs
        for epoch in range(1,num_epochs+1):
            with torch.no_grad():
                #get squared error before optimization
                v = self.pass_through_full(data)
                err = (1/N) * torch.sum(torch.pow("cpu"), 2))
            print("\nBefore epoch {}, train squared error: {:.4f}\n".format(epoch, err))
           #*******THIS IS THE PROBLEM SECTION*******#
            for ii,(batch,_) in tqdm(enumerate(dataloader), ascii=True, desc="DBN fine-tuning", file=sys.stdout):
                print("Fine-tuning epoch {}, batch {}".format(epoch, ii))
                with torch.no_grad():
                    batch = batch.view(len(batch) , self.rbm_layers[0].visible_units)
                    if self.use_gpu: #are we using a GPU?
                        batch = #if so, send batch to GPU
                    B = batch.shape[0]
                    def closure():
                        output = self.pass_through_full(batch)
                        loss = nn.BCELoss(reduction='sum')(output, batch)/B
                        print("Batch {}, loss: {}\r".format(ii, loss))
                        return loss

The error I get is:

DBN fine-tuning: 0it [00:00, ?it/s]Fine-tuning epoch 1, batch 0     
Batch 0, loss: 4021.35400390625  
Batch 0, loss: 4017.994873046875  
DBN fine-tuning: 0it [00:00, ?it/s] 
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>   
  File "/home/deep_autoencoder/", line 260, in fine_tuning  
  File "/home/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 15, in decorate_context
    return func(*args, **kwargs)  
  File "/home/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/optim/lbfgs.py", line 425, in step
    loss, flat_grad, t, ls_func_evals = _strong_wolfe( 
  File "/home/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/optim/lbfgs.py", line 96, in _strong_wolfe
    g_prev = g_new.clone(memory_format=torch.contiguous_format) 
RuntimeError: CUDA out of memory. Tried to allocate 1.57 GiB (GPU 0; 24.00 GiB total capacity; 13.24 GiB already allocated; 1.41 GiB free; 20.07 GiB reserved in total by PyTorch)

This also racks up memory if I use CPU, so I’m not sure what the solution is here…