I ran two experiments: one with batch size 5 and accumulation_steps = 2, and another with batch size 10 and accumulation_steps = 1, with no other code changes. I expected to see similar loss values on wandb for both runs, but they differed. I used the same seed and learning rate (no LR schedule), no dropout, deterministic mode enabled, and data shuffling turned off; the model also uses instance norm, not batch norm. Here is the code, in case anyone can help:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.cuda.amp import GradScaler
from tqdm import tqdm
import wandb

optimizer = Adam(model.parameters(), lr=config.training.lr)
scaler = GradScaler()
counter = 0
total_loss = 0.0
accumulation_steps = 2
optimizer.zero_grad()

for batch in tqdm(train_loader):
    y = batch.to(device)
    # the input mixture is the sum over the instrument/stem dimension
    x = y.sum(1)
    if config.training.target_instrument is not None:
        i = config.training.instruments.index(config.training.target_instrument)
        y = y[:, i]

    with torch.cuda.amp.autocast():
        y_ = model(x)
        loss = nn.MSELoss()(y_, y)

    scaler.scale(loss).backward()

    # step the optimizer only every accumulation_steps mini-batches
    if (counter + 1) % accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

    total_loss += loss.item()
    counter += 1

    # log the average loss over the last 100 mini-batches
    if counter % 100 == 0:
        average_loss = total_loss / 100
        wandb.log({"loss_100_steps": average_loss})
        total_loss = 0.0
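
For reference, the seeding / determinism setup I mentioned looks roughly like this (a minimal sketch; the exact seed value, the DataLoader construction, and the config field names are assumptions, not my exact code):

import random
import numpy as np
import torch
from torch.utils.data import DataLoader

seed = 42  # assumed value; the real seed comes from my config
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# "deterministic set to true" corresponds to these flags in my setup
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# shuffling off, so both runs see the samples in the same order
# (train_dataset and the batch size field are placeholders)
train_loader = DataLoader(
    train_dataset,
    batch_size=config.training.batch_size,
    shuffle=False,
)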