Hi,
I’m working in GPytorch which uses Pytorch for Gaussian process regression. My problem requires that I train a number of GPs (600 in total). They are each trained on the same input, but different outputs. The training process was fine on the GPU and the results were then saved locally using state_dict()
.
The problem is that I now want to make predictions with each of these GPs. I import them back into Python and, for every model, evaluate model predictions on the the testing input. I plot a snippet of the model output against the true test data. The results of the plots are good, but every prediction (the line preds = lhood(ImportedGPs[i](test_x))
) seems to increase GPU memory use by approx. 0.25GB, and is not released by the end of the loop.
I am not sure exactly where my memory leak is - I’ve used with torch.no_grad()
, I’ve deleted all objects produced inside the loop, and used various techniques I found online, e.g. gc.collect()
and torch.cuda.empty_cache()
.
Some help with this would be greatly appreciated. I am happy to upload local files (e.g. imports at start and GP model .pth files) if needed.
# Standard packages
import matplotlib.pyplot as plt
import numpy as np
import torch
import gpytorch
import gc
# Created local files
from import_txt import import_txt
from Parameters import Parameters
################################################################################################################
# Import raw X and Y data
K_data = import_txt("K_3x3.txt")
p_data = import_txt("P_3x3.txt")
# Set an object that carries all parameters and (standardised) data. There is no GPU use here.
par = Parameters(K_data, p_data,epochs = 200, learning_rate = 0.1, learning_rate_update = 0.5, M = 3)
# Set device name and move (standardised version of) data to gpu
output_device = torch.device('cuda:0')
train_x = torch.tensor(par.TrainX.astype(np.float32))
train_y = torch.tensor(par.TrainY.astype(np.float32))
test_x = torch.tensor(par.TestX.astype(np.float32))
train_x, train_y, test_x = train_x.to(output_device), train_y.to(output_device), test_x.to(output_device)
#################################################################################################################
# Define GP model in order to import all GP models
class GaussianProcessModel(gpytorch.models.ExactGP):
def __init__(self, train_x, train_y, likelihood):
super(GaussianProcessModel, self).__init__(train_x, train_y, likelihood)
self.mean_module = gpytorch.means.ConstantMean()
base_covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
self.covar_module = gpytorch.kernels.MultiDeviceKernel(base_covar_module,
device_ids=range(torch.cuda.device_count()),
output_device=torch.device('cuda:0'))
def forward(self, x):
mean_x = self.mean_module(x)
covar_x = self.covar_module(x)
return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
# Importing all 600 GPs (previously trained on GPU and saved locally) to list
ImportedGPs = []
for time_index in range(1,par.times+1):
for sensor_id in list(range(1,par.sensors_per_time+1)):
path = 'GPs3x3/GP_' + str(sensor_id) + '_' + str(time_index) +'.pth'
state_dict = torch.load(path)
model = GaussianProcessModel(train_x, train_y[:,(time_index-1)*par.sensors_per_time + (sensor_id-1)], gpytorch.likelihoods.GaussianLikelihood().to('cuda:0')) # Create a new GP model
model.train()
model.load_state_dict(state_dict)
model.eval()
ImportedGPs.append(model)
#################################################################################################################
# Test GPU usage
memory_start = torch.cuda.memory_allocated()/1e9
print(f"GPU memory start: {torch.cuda.memory_allocated()/1e9}GB")
lhood = par.likelihood.to('cuda:0') # move likelihood to gpu
[ImportedGPs[i].to('cuda:0') for i in range(len(ImportedGPs))] # move all models to gpu
with torch.no_grad():
for i in range(len(ImportedGPs)):
print(i)
preds = lhood(ImportedGPs[i](test_x)) # make predictions using ith model
preds_mean = preds.mean.detach().cpu().numpy()*par.std_p + par.mean_p # corrects the standardised mean
preds_std = preds.stddev.detach().cpu().numpy()*par.std_p # corrects the standardised std
# Plot GP mean and 50% uncertainty with the true values in test set (first 50 test instances)
plt.plot(list(range(50)),preds_mean[0:50],label="ImportedGPs")
plt.plot(list(range(50)),par.TestY[0:50,i]*par.std_p+par.mean_p,label="True")
plt.fill_between(list(range(50)),preds_mean[0:50]-0.674*preds_std[0:50],preds_mean[0:50]+0.674*preds_std[0:50],color="green",alpha=0.2)
plt.legend()
plt.ylim([0, 7e5])
plt.show()
del preds, preds_mean, preds_std
gc.collect()
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()
lhood = par.likelihood.to('cpu') # move likelihood back to cpu
[ImportedGPs[i].to('cpu') for i in range(len(ImportedGPs))] # move all models back to cpu
print(f"GPU end: {torch.cuda.memory_allocated()/1e9}GB")
print(f"GPU memory change: {torch.cuda.memory_allocated()/1e9 - memory_start}GB")