Hi,
I’m doing my deep learning assignment and I moved my code from Kaggle (P100) to a computing cluster (V100). The code and config stay the same, but when I run training on the cluster the model doesn’t converge (the loss goes up, then becomes NaN, etc.), even though it trains just fine on Kaggle.
Any ideas on what could’ve happened or how to fix this?
I tried isolating the issue, and it seems to come from torch.autocast: when I remove the with torch.autocast(device_type='cuda'): block, training works just fine.
Here is my training loop:
# Note: device, scaler, and scheduler are globals defined outside this function.
def train_model(model, train_loader, criterion, optimizer):
    model.train()
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True,
                     leave=False, position=0, desc='Train')
    total_loss = 0

    for i, data in enumerate(train_loader):
        optimizer.zero_grad()

        x, y, lx, ly = data
        x, y = x.to(device), y.to(device)
        lx, ly = lx.to(device), ly.to(device)

        # Mixed-precision forward pass
        with torch.autocast(device_type='cuda'):
            h, lh = model(x, lx)
            h = torch.permute(h, (1, 0, 2))  # reorder dims to the layout the criterion expects
            loss = criterion(h, y, lh, ly)

        total_loss += loss.item()
        batch_bar.set_postfix(
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            lr="{:.06f}".format(float(optimizer.param_groups[0]['lr'])))
        batch_bar.update()

        # Scaled backward pass and optimizer step
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Step per-batch schedulers
        if scheduler:
            if isinstance(scheduler, (torch.optim.lr_scheduler.CosineAnnealingLR,
                                      torch.optim.lr_scheduler.OneCycleLR)):
                scheduler.step()

        del x, y, lx, ly, h, lh, loss
        torch.cuda.empty_cache()

    batch_bar.close()
    return total_loss / len(train_loader)
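
For reference, this is roughly what I mean by "removing autocast" — a sketch of the variant that converges: the autocast context and the GradScaler calls are both gone, so everything runs in float32. (I've passed scheduler in and defined device at the top just so the sketch is self-contained; in my actual code they are globals, as above.)

import torch
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_model_fp32(model, train_loader, criterion, optimizer, scheduler=None):
    """Same loop as above, but without torch.autocast or GradScaler (full float32)."""
    model.train()
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True,
                     leave=False, position=0, desc='Train')
    total_loss = 0

    for i, data in enumerate(train_loader):
        optimizer.zero_grad()

        x, y, lx, ly = data
        x, y = x.to(device), y.to(device)
        lx, ly = lx.to(device), ly.to(device)

        # Full-precision forward pass: no autocast context
        h, lh = model(x, lx)
        h = torch.permute(h, (1, 0, 2))
        loss = criterion(h, y, lh, ly)
        total_loss += loss.item()

        # Plain backward pass and optimizer step: no GradScaler
        loss.backward()
        optimizer.step()

        if scheduler is not None and isinstance(
                scheduler, (torch.optim.lr_scheduler.CosineAnnealingLR,
                            torch.optim.lr_scheduler.OneCycleLR)):
            scheduler.step()

        batch_bar.set_postfix(
            loss="{:.04f}".format(total_loss / (i + 1)),
            lr="{:.06f}".format(optimizer.param_groups[0]['lr']))
        batch_bar.update()

    batch_bar.close()
    return total_loss / len(train_loader)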
Best,