Hi everyone, I ran into a strange error today: a `RuntimeError: CUDA error: an illegal memory access was encountered` pops up at `torch.cuda.empty_cache()`. Even more peculiarly, the error only shows up at the 39th epoch of a training session. How could that be?
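From what I have read, CUDA errors are reported asynchronously, so the call in the traceback may just be where an earlier kernel's failure finally surfaces rather than the real culprit. A minimal sketch of how I plan to re-run to localize it, using the standard `CUDA_LAUNCH_BLOCKING` environment variable:

```python
# Force synchronous kernel launches so an illegal memory access is raised
# at the offending line instead of at a later CUDA call such as
# torch.cuda.empty_cache(). Must be set before CUDA is initialized.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch  # import torch only after setting the variable
```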
Full traceback:

```
Traceback (most recent call last):
  File "build_model_and_train.py", line 206, in <module>
    train_loss, train_acc = train()
  File "build_model_and_train.py", line 105, in train
    torch.cuda.empty_cache()
  File "/public/workspace/z/miniconda3/envs/ST-Torch/lib/python3.7/site-packages/torch/cuda/memory.py", line 35, in empty_cache
    torch._C._cuda_emptyCache()
RuntimeError: CUDA error: an illegal memory access was encountered
```
Code snippet:
```python
def train():
    model.train()
    loss_all = 0
    correct = 0
    for i, data in enumerate(train_loader, 0):
        torch.cuda.empty_cache()  # This is where the issue lies!
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data.x, data.batch)
        torch.cuda.empty_cache()  # This is not where the issue lies.
        label = data.y.to(device)
        loss = loss_func(output, label)
        loss.backward()
        loss_all += loss.item()
        output = output.detach().cpu().numpy().squeeze()
        label = label.detach().cpu().numpy().squeeze()
        correct += (abs(output - label) < 0.5).sum()
        optimizer.step()
    return loss_all / len(train_dataset), correct / len(train_dataset)
```
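In case it helps diagnosis: two classic causes of delayed illegal-memory-access errors in a setup like this are out-of-range node indices in a GNN's gather/scatter kernels and `BCELoss` inputs outside [0, 1]. A hypothetical per-batch sanity check I could call before `loss_func` (the `check_batch` helper, and its assumption that the global `edge_index` indexes into each batch's nodes, are my own sketch, not part of the original code):

```python
def check_batch(data, output):
    # Out-of-bounds node indices corrupt memory inside gather/scatter
    # kernels; the resulting error often surfaces only much later.
    num_nodes = data.x.size(0)
    assert int(edge_index.max()) < num_nodes, "edge_index out of range"
    # BCELoss expects probabilities in [0, 1]; anything outside can
    # trip a device-side assert that masquerades as a memory error.
    assert float(output.min()) >= 0.0 and float(output.max()) <= 1.0, \
        "model output outside [0, 1] -- was the final sigmoid applied?"
```

Calling `.min()`/`.max()` also forces a device synchronization, so a pending asynchronous error would be raised here, closer to its origin.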
```python
## Start training
# Omitted: dataset generation, DataLoader construction, and other setup
os.environ["CUDA_VISIBLE_DEVICES"] = '2'
......
device = torch.device('cuda')
edge_index = edge_index.to(device)
model = GCN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=l2_reg)
loss_func = torch.nn.BCELoss()  # binary cross-entropy
train_loader = DataLoader(train_dataset, batch_size=batch_size, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

for epoch in range(num_epochs):
    gc.collect()
    torch.cuda.empty_cache()  # Yes, I put this everywhere because I am suffering from OOMs...
    train_loss, train_acc = train()
```
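For what it's worth, my understanding is that `torch.cuda.empty_cache()` only returns cached, unused blocks to the driver; it cannot free memory held by live tensors, and calling it every iteration mostly slows training down. A sketch of what I could log once per epoch instead, using `torch.cuda.memory_allocated()` and `torch.cuda.memory_reserved()` (the latter was named `memory_cached()` in older releases):

```python
# Inside the epoch loop: log live vs. cached GPU memory. A steadily
# growing "allocated" figure points at tensors being kept alive
# (e.g. losses stored with graphs attached), which empty_cache()
# cannot reclaim.
allocated_mib = torch.cuda.memory_allocated(device) / 2**20
reserved_mib = torch.cuda.memory_reserved(device) / 2**20
print(f"epoch {epoch}: allocated={allocated_mib:.1f} MiB, "
      f"reserved={reserved_mib:.1f} MiB")
```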