Observing significantly different outputs from torch's TransformerEncoderLayer when changing the batch size (even though I set batch_first=True) - does anyone know why?
Code to minimally reproduce:
#######
import torch
import numpy as np
import random
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
def set_random_seeds(seed=0, device='cuda:0'):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.use_deterministic_algorithms(True)
    os.environ['PYTHONHASHSEED'] = str(seed)
    if device != 'cpu':
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
set_random_seeds()
l1 = torch.nn.TransformerEncoderLayer(1024, 1, 1024, 0.0, batch_first=True).double().cuda()
l1.eval()
x = torch.rand((128, 1024)).double().cuda()
diff = (l1(x)[:2] - l1(x[:2])).abs().mean()
print(diff)
The printed diff is around 0.18 with torch version 2.3.1+cu121, whereas I expected the two slices to match (a diff of ~0).
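For comparison, here is a sketch of the check I would do with an explicitly batched 3D input (the shape (128, 4, 1024) and the names l2/xb are just my illustration, not part of the repro above). With a (batch, seq, d_model) tensor and batch_first=True, the first dimension is treated as the batch, so I would expect the per-sample outputs to be independent of the batch size and the diff to be ~0:
#######
set_random_seeds()
l2 = torch.nn.TransformerEncoderLayer(1024, 1, 1024, 0.0, batch_first=True).double().cuda()
l2.eval()
# explicit (batch, seq, d_model) input so batch_first=True applies to dim 0
xb = torch.rand((128, 4, 1024)).double().cuda()
with torch.no_grad():
    diff_batched = (l2(xb)[:2] - l2(xb[:2])).abs().mean()
print(diff_batched)  # expected to be ~0 (up to float64 rounding)
#######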