Hello, I’m trying to use CUDAGraphs however, I cannot manage to obtain the same results from a regular model and its graph version.
I’m using its example code, nothing fancy:
import torch
import numpy as np
import random
def random_seed(seed_value, use_cuda):
np.random.seed(seed_value) # cpu vars
torch.manual_seed(seed_value) # cpu vars
random.seed(seed_value) # Python
if use_cuda:
torch.cuda.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value) # gpu vars
torch.backends.cudnn.deterministic = True #needed
torch.backends.cudnn.benchmark = False
random_seed(42,True)
N, D_in, H, D_out = 640, 4096, 2048, 1024
# Placeholders used for capture
static_input = torch.randn(N, D_in, device='cuda')
static_target = torch.randn(N, D_out, device='cuda')
real_inputs = [torch.rand_like(static_input) for _ in range(10)]
real_targets = [torch.rand_like(static_target) for _ in range(10)]
####### Torch model
model_torch = torch.nn.Sequential(torch.nn.Linear(D_in, H),
torch.nn.Dropout(p=0.2),
torch.nn.Linear(H, D_out),
torch.nn.Dropout(p=0.1)).cuda()
loss_torch = torch.nn.MSELoss()
opt_torch = torch.optim.SGD(model_torch.parameters(), lr=0.1)
for data, target in zip(real_inputs, real_targets):
opt_torch.zero_grad(set_to_none=True)
y_pred = model_torch(data)
loss = loss_torch(y_pred, target)
loss.backward()
opt_torch.step()
###### CUDA Graph model
random_seed(42,True)
model = torch.nn.Sequential(torch.nn.Linear(D_in, H),
torch.nn.Dropout(p=0.2),
torch.nn.Linear(H, D_out),
torch.nn.Dropout(p=0.1)).cuda()
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# warmup
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
for inp, targ in zip(real_inputs[:3], real_targets[:3]):
optimizer.zero_grad(set_to_none=True)
y_pred_ = model(inp)
loss = loss_fn(y_pred_, targ)
loss.backward()
optimizer.step()
torch.cuda.current_stream().wait_stream(s)
# capture
g = torch.cuda.CUDAGraph()
# Sets grads to None before capture, so backward() will create
# .grad attributes with allocations from the graph's private pool
with torch.cuda.graph(g):
optimizer.zero_grad(set_to_none=True)
static_y_pred = model(static_input)
static_loss = loss_fn(static_y_pred, static_target)
static_loss.backward()
optimizer.step()
for data, target in zip(real_inputs, real_targets):
static_input.copy_(data)
static_target.copy_(target)
g.replay()
# As the graph training loop last input it's real_inputs[-1], I just compare them
normal_res = model_torch(real_inputs[-1])
# Tensors are very different
# print(normal_res)
# print(static_y_pred)
assert np.allclose(static_y_pred.cpu().detach().numpy(), normal_res.cpu().detach().numpy(), atol=1e-3) # Always fails
What am I missing here? I also tried to do the warmup and capture parts with the torch model, but it didn’t work.
I have also another question (which it might be as well another post): How do I only run inference with CUDAGraphs? Capturing with the graph 2 different graphs, splitting it in to 2 parts (forward and backward)? There is no documentation about it