GPU memory increasing during PyTorch model evaluations

Hi,

I’m working with GPyTorch, which builds on PyTorch, for Gaussian process regression. My problem requires training a number of GPs (600 in total), each on the same input but a different output. Training went fine on the GPU, and each model was then saved locally via its state_dict().
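(For reference, the saving step was essentially the one-liner below; model, sensor_id and time_index are the loop variables from my training script, and the path pattern matches the import loop further down.)

# Sketch of how each trained GP was saved (same GPs3x3/GP_<sensor>_<time>.pth pattern as the import loop below)
torch.save(model.state_dict(), f'GPs3x3/GP_{sensor_id}_{time_index}.pth')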

The problem is that I now want to make predictions with each of these GPs. I import them back into Python and, for every model, evaluate predictions on the test input. I plot a snippet of the model output against the true test data. The plots look good, but every prediction (the line preds = lhood(ImportedGPs[i](test_x))) increases GPU memory use by approximately 0.25 GB, and that memory is not released by the end of the loop.

I am not sure exactly where the memory leak is. I have wrapped the loop in with torch.no_grad(), deleted all objects produced inside the loop, and tried various techniques I found online, e.g. gc.collect() and torch.cuda.empty_cache().
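
To show what I mean, here is a minimal sketch of the kind of per-iteration check that exposes the growth (it only uses objects that appear in the full script below, plus torch.cuda.memory_allocated()):

# Sketch: print allocated GPU memory before and after each prediction call
with torch.no_grad():
    for i in range(len(ImportedGPs)):
        before = torch.cuda.memory_allocated() / 1e9
        preds = lhood(ImportedGPs[i](test_x))
        after = torch.cuda.memory_allocated() / 1e9
        print(f"model {i}: {before:.3f} GB -> {after:.3f} GB (+{after - before:.3f} GB)")  # roughly +0.25 GB per model
        del preds
        gc.collect()
        torch.cuda.empty_cache()

The allocated figure climbs by roughly 0.25 GB per model and never drops back, even after the del / gc.collect() / torch.cuda.empty_cache() calls.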

Some help with this would be greatly appreciated. I am happy to upload local files (e.g. the imports at the start and the GP model .pth files) if needed.

# Standard packages
import matplotlib.pyplot as plt
import numpy as np
import torch
import gpytorch
import gc 

# Created local files
from import_txt import import_txt
from Parameters import Parameters

################################################################################################################

# Import raw X and Y data
K_data = import_txt("K_3x3.txt")
p_data = import_txt("P_3x3.txt")

# Set an object that carries all parameters and (standardised) data. There is no GPU use here.
par = Parameters(K_data, p_data, epochs=200, learning_rate=0.1, learning_rate_update=0.5, M=3)


# Set device name and move (standardised version of) data to gpu
output_device = torch.device('cuda:0')
train_x = torch.tensor(par.TrainX.astype(np.float32))
train_y = torch.tensor(par.TrainY.astype(np.float32))
test_x = torch.tensor(par.TestX.astype(np.float32))
train_x, train_y, test_x = train_x.to(output_device), train_y.to(output_device), test_x.to(output_device)

#################################################################################################################

# Define GP model in order to import all GP models
class GaussianProcessModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(GaussianProcessModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        base_covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

        self.covar_module = gpytorch.kernels.MultiDeviceKernel(base_covar_module,
                                                                device_ids=range(torch.cuda.device_count()),
                                                                output_device=torch.device('cuda:0'))

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Importing all 600 GPs (previously trained on GPU and saved locally) to list
ImportedGPs = []
for time_index in range(1, par.times + 1):
    for sensor_id in range(1, par.sensors_per_time + 1):
        path = f'GPs3x3/GP_{sensor_id}_{time_index}.pth'
        state_dict = torch.load(path)
        # Create a new GP model for the train_y column belonging to this sensor/time pair
        column = (time_index - 1) * par.sensors_per_time + (sensor_id - 1)
        likelihood = gpytorch.likelihoods.GaussianLikelihood().to('cuda:0')
        model = GaussianProcessModel(train_x, train_y[:, column], likelihood)
        model.train()
        model.load_state_dict(state_dict)
        model.eval()
        ImportedGPs.append(model)

#################################################################################################################

# Test GPU usage
memory_start = torch.cuda.memory_allocated()/1e9
print(f"GPU memory start: {memory_start}GB")

lhood = par.likelihood.to('cuda:0') # move likelihood to gpu
for gp in ImportedGPs: # move all models to gpu
    gp.to('cuda:0')

with torch.no_grad():  
    for i in range(len(ImportedGPs)):
        print(i)
        preds = lhood(ImportedGPs[i](test_x)) # make predictions using ith model
        preds_mean = preds.mean.detach().cpu().numpy()*par.std_p + par.mean_p # corrects the standardised mean
        preds_std = preds.stddev.detach().cpu().numpy()*par.std_p # corrects the standardised std
    
        # Plot GP mean and 50% uncertainty with the true values in test set (first 50 test instances)
        plt.plot(list(range(50)), preds_mean[0:50], label="ImportedGPs")
        plt.plot(list(range(50)), par.TestY[0:50, i]*par.std_p + par.mean_p, label="True")
        plt.fill_between(list(range(50)),
                         preds_mean[0:50] - 0.674*preds_std[0:50],
                         preds_mean[0:50] + 0.674*preds_std[0:50],
                         color="green", alpha=0.2)
        plt.legend()
        plt.ylim([0, 7e5])
        plt.show()
    
        del preds, preds_mean, preds_std
        
        gc.collect()
        torch.cuda.empty_cache() 
    
gc.collect()
torch.cuda.empty_cache() 

lhood = par.likelihood.to('cpu') # move likelihood back to cpu
for gp in ImportedGPs: # move all models back to cpu
    gp.to('cpu')

print(f"GPU end: {torch.cuda.memory_allocated()/1e9}GB")
print(f"GPU memory change: {torch.cuda.memory_allocated()/1e9 - memory_start}GB")

I don’t see anything obviously wrong, but I also don’t know what par is, for example, or whether its attributes might need to be detached.
In any case, could you remove the plotting code and see whether that changes the memory usage?
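
Something along these lines, i.e. your loop with all plotting stripped out and only the prediction (plus a memory print) kept, would show whether the model call alone accounts for the growth. I’m reusing your variable names; this is just a sketch of the experiment, not a fix:

# Same loop, plotting removed, per-iteration memory print added
with torch.no_grad():
    for i in range(len(ImportedGPs)):
        preds = lhood(ImportedGPs[i](test_x))  # prediction only, no plotting
        preds_mean = preds.mean.cpu().numpy()*par.std_p + par.mean_p
        preds_std = preds.stddev.cpu().numpy()*par.std_p
        del preds, preds_mean, preds_std
        gc.collect()
        torch.cuda.empty_cache()
        print(f"after model {i}: {torch.cuda.memory_allocated()/1e9:.3f} GB allocated")

If the allocated memory still climbs with matplotlib out of the picture, the plotting code can be ruled out.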