Hello,
I’m trying to adopt torch.compile in my program, but I’m running into reproducibility issues. The following script, run with PyTorch 2.7.1, reproduces the problem:
```python
import os
from functools import partial

import torch
from torch.nn import MultiheadAttention
from torch.testing import assert_close

os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # required for deterministic cuBLAS (e.g. cdist)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(True, warn_only=False)
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
# torch.backends.cuda.enable_math_sdp(False)

strict_assert_close = partial(assert_close, atol=0, rtol=0)
device = "cuda"


def fn():
    torch._dynamo.reset()
    self_attn_builder = partial(
        MultiheadAttention,
        embed_dim=4,
        num_heads=1,
        dropout=0.1,
        bias=True,
        batch_first=True,
    )

    torch.manual_seed(0)
    tf_raw = self_attn_builder().to(device)
    torch.manual_seed(0)
    tf_compiled = torch.compile(self_attn_builder().to(device), fullgraph=True)

    torch.manual_seed(0)
    transformer_input = torch.randn(torch.Size([128, 50, 4])).to(device)

    torch.manual_seed(0)
    output_1 = tf_raw(
        transformer_input, transformer_input, transformer_input,
        attn_mask=None, key_padding_mask=None,
    )
    torch.manual_seed(0)
    output_compiled = tf_compiled(
        transformer_input, transformer_input, transformer_input,
        attn_mask=None, key_padding_mask=None,
    )

    try:
        strict_assert_close(output_compiled, output_1)
        return True
    except AssertionError:
        return False


from torch._inductor.compiler_bisector import CompilerBisector

CompilerBisector.do_bisect(fn)
```
When I run the bisector, I can see that the output of `aten._softmax` differs between compiled and eager mode, which breaks my reproducibility tests.
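To narrow this down, I was planning to check whether a bare softmax already diverges bitwise under torch.compile, independent of the attention module and dropout. This is only a sketch of what I intend to try, not something the bisector produced:

```python
import torch

def softmax_fn(x):
    # plain eager softmax over the last dimension
    return torch.softmax(x, dim=-1)

# compile the same function and compare bitwise against eager
compiled_softmax = torch.compile(softmax_fn, fullgraph=True)

x = torch.randn(128, 50, 50, device="cuda")
print(torch.equal(softmax_fn(x), compiled_softmax(x)))  # True only if bit-identical
```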
Is this expected behavior, or is it a bug? Are there any options to ensure identical results between torch.compile and eager execution?
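For reference, these are the two knobs I was planning to experiment with next; I’m assuming `torch._inductor.config.fallback_random` is the right flag for making Inductor reuse eager-style random number generation, and that comparing in eval mode is a valid way to rule out dropout as the source of the divergence:

```python
import torch
import torch._inductor.config as inductor_config

# (a) Assumption: with fallback_random=True, Inductor draws random numbers
#     the same way eager mode does, so dropout masks should match.
inductor_config.fallback_random = True

# (b) Rule out dropout entirely by comparing in eval mode: with dropout
#     disabled, any remaining mismatch is purely numerical.
def compare_in_eval(module, x):
    module.eval()
    compiled = torch.compile(module, fullgraph=True)
    with torch.no_grad():
        out_eager, _ = module(x, x, x)        # original module still runs eagerly
        out_compiled, _ = compiled(x, x, x)   # wrapped copy runs through Inductor
    return torch.equal(out_eager, out_compiled)
```

Is either of these the recommended approach, or is there a supported way to get bit-identical results?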
Thanks for your help!