Hi everyone,
I know this issue has been raised before, but I couldn’t solve my problem with the other posts I found.
I’m trying to run an experiment with a basic 2-layer MLP. I load a dataset, train for a few epochs, and then I want to use the model. However, the error
RuntimeError: CUDA error: an illegal memory access was encountered
is sometimes raised and I can’t understand why. I thought that torch.cuda.empty_cache() could do the trick, but it doesn’t.
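Since CUDA errors are reported asynchronously, I also tried forcing synchronous kernel launches so the stack trace points at the op that actually fails (a minimal sketch; as far as I understand, the variable must be set before CUDA is initialized):

import os

# Assumption: setting this before the first CUDA call makes kernel launches
# synchronous, so the Python stack trace matches the failing operation.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch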
Also, I checked the memory usage of my GPU and it’s far from fully used.
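For reference, this is roughly how I check it (a small sketch using PyTorch’s allocator counters):

import torch

# "allocated" counts live tensors; "reserved" is what the caching allocator holds.
print(f"allocated: {torch.cuda.memory_allocated() / 1e6:.1f} MB")
print(f"reserved:  {torch.cuda.memory_reserved() / 1e6:.1f} MB")
print(f"total:     {torch.cuda.get_device_properties(0).total_memory / 1e6:.1f} MB")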
Here are some details:
- CUDA version: 12.1
- Driver version: 530.30.02
- GPU: NVIDIA GeForce GTX 1070
And here is what my code looks like:
I first load my data from saved loaders:

import torch
from torch.utils.data import DataLoader, TensorDataset

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_loader = torch.load("train_loader.pth")
test_loader = torch.load("test_loader.pth")
dataset = torch.load("/dataset.pth")
My data consists of latent representations of time series as inputs, and the outputs are the original time series that I’m trying to reconstruct:
for batch in train_loader:
    latent_dim = batch[0].shape[1]
    output_dim = batch[1].shape[1]
    break

print(f"Latent dim: {latent_dim}, output dim: {output_dim}")
Output: Latent dim: 198, output dim: 150
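In case it matters, here is the quick sanity check I run on one batch (a sketch; the float32/CPU expectations are my assumptions about how the loaders were saved):

inputs, targets = next(iter(train_loader))

# The loaders come from torch.load, so I double-check what they actually contain.
print(inputs.shape, inputs.dtype, inputs.device)    # expecting [B, 198], float32, cpu
print(targets.shape, targets.dtype, targets.device) # expecting [B, 150], float32, cpu
assert torch.isfinite(inputs).all() and torch.isfinite(targets).all()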
import torch.nn as nn

class Decoder(nn.Module):
    def __init__(self, latent_size, out_size):
        super().__init__()
        self.latent_size = latent_size
        self.out_size = out_size
        self.linear1 = nn.Linear(latent_size, latent_size)
        self.linear2 = nn.Linear(latent_size, out_size)
        self.relu = nn.ReLU(True)  # inplace ReLU

    def __repr__(self):
        return f"2-layer MLP decoder (latent_size={self.latent_size}, out_size={self.out_size})"

    def forward(self, z):
        out = self.linear1(z)
        out = self.relu(out)
        out = self.linear2(out)
        return out

    def training_step(self, batch):
        # Moves the batch to the GPU and returns the reconstruction loss.
        inputs, targets = batch
        inputs = inputs.to(DEVICE)
        targets = targets.to(DEVICE)
        outputs = self(inputs)
        return nn.MSELoss()(outputs, targets)
@torch.no_grad()  # evaluation doesn't need gradients, so don't keep the graph alive
def evaluate(model, data_loader):
    model.eval()
    outputs = [model.training_step(batch) for batch in data_loader]
    return torch.stack(outputs).mean().item()
def training(n_epochs, model, train_loader, test_loader, opt_func=torch.optim.Adam):
    torch.cuda.empty_cache()
    history = []
    optimizer = opt_func(model.parameters(), lr=1e-3)
    for epoch in range(n_epochs):
        model.train()
        train_losses = []
        for batch in train_loader:
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            train_losses.append(loss.item())
        train_loss = evaluate(model, train_loader)
        val_loss = evaluate(model, test_loader)
        print(
            f"Epoch [{epoch+1}/{n_epochs}], train_loss: {train_loss:.4f}, val_loss: {val_loss:.4f}"
        )
        history.append({"train_loss": train_loss, "val_loss": val_loss})
    return history
decoder = Decoder(latent_size=latent_dim, out_size=output_dim)
decoder = decoder.to(DEVICE)
history = training(50, decoder, train_loader, test_loader, opt_func=torch.optim.AdamW)
The issue usually occurs here:
import matplotlib.pyplot as plt

decoder.eval()
input, target = dataset[0]
input = input.to(DEVICE)  #### RAISES the ERROR ####
target = target.to(DEVICE)
with torch.no_grad():
    prediction = decoder(input)
print(prediction.shape)
plt.plot(prediction.cpu(), label="prediction")  # tensors must be on CPU for matplotlib
plt.plot(target.cpu(), label="target")
plt.legend()
Any idea what I’m doing wrong?