CUDAGraph generate different results

Hello, I’m trying to use CUDAGraphs however, I cannot manage to obtain the same results from a regular model and its graph version.

I’m using its example code, nothing fancy:

import torch
import numpy as np
import random


def random_seed(seed_value, use_cuda):
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    random.seed(seed_value) # Python
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

random_seed(42,True)

N, D_in, H, D_out = 640, 4096, 2048, 1024

# Placeholders used for capture
static_input = torch.randn(N, D_in, device='cuda')
static_target = torch.randn(N, D_out, device='cuda')

real_inputs = [torch.rand_like(static_input) for _ in range(10)]
real_targets = [torch.rand_like(static_target) for _ in range(10)]

####### Torch model
model_torch = torch.nn.Sequential(torch.nn.Linear(D_in, H),
                            torch.nn.Dropout(p=0.2),
                            torch.nn.Linear(H, D_out),
                            torch.nn.Dropout(p=0.1)).cuda()
loss_torch = torch.nn.MSELoss()
opt_torch = torch.optim.SGD(model_torch.parameters(), lr=0.1)

for data, target in zip(real_inputs, real_targets):
    opt_torch.zero_grad(set_to_none=True)
    y_pred = model_torch(data)
    loss = loss_torch(y_pred, target)
    loss.backward()
    opt_torch.step()

###### CUDA Graph model
random_seed(42,True)

model = torch.nn.Sequential(torch.nn.Linear(D_in, H),
                            torch.nn.Dropout(p=0.2),
                            torch.nn.Linear(H, D_out),
                            torch.nn.Dropout(p=0.1)).cuda()
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# warmup
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for inp, targ in zip(real_inputs[:3], real_targets[:3]):
        optimizer.zero_grad(set_to_none=True)
        y_pred_ = model(inp)
        loss = loss_fn(y_pred_, targ)
        loss.backward()
        optimizer.step()
torch.cuda.current_stream().wait_stream(s)

# capture
g = torch.cuda.CUDAGraph()
# Sets grads to None before capture, so backward() will create
# .grad attributes with allocations from the graph's private pool
with torch.cuda.graph(g):
    optimizer.zero_grad(set_to_none=True)
    static_y_pred = model(static_input)
    static_loss = loss_fn(static_y_pred, static_target)
    static_loss.backward()
    optimizer.step()

for data, target in zip(real_inputs, real_targets):
    static_input.copy_(data)
    static_target.copy_(target)
    g.replay()

# As the graph training loop last input it's real_inputs[-1], I just compare them
normal_res = model_torch(real_inputs[-1])

# Tensors are very different
# print(normal_res)
# print(static_y_pred)

assert np.allclose(static_y_pred.cpu().detach().numpy(), normal_res.cpu().detach().numpy(), atol=1e-3)  # Always fails

What am I missing here? I also tried to do the warmup and capture parts with the torch model, but it didn’t work.

I have also another question (which it might be as well another post): How do I only run inference with CUDAGraphs? Capturing with the graph 2 different graphs, splitting it in to 2 parts (forward and backward)? There is no documentation about it