dataset = RandomDataset(input_shape, 80*batch_size, rank)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
data_iter = iter(dataloader)
model = model(pretrained=True).to(rank)
optimizer = optim.SGD(model.parameters(), lr=0.0001)
criterion = torch.nn.CrossEntropyLoss()
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
print("[MAKING DDP Model]")
model = DDP(model)
print("[MODEL CREATED]")
for i in range(11):
optimizer.zero_grad(set_to_none=True)
inputs, labels = next(data_iter)
output = model(inputs)
loss = criterion(output, labels)
loss.backward()
optimizer.step()
capture_input = torch.empty((batch_size, 3, input_shape, input_shape)).to(rank)
capture_target = torch.argmax(torch.from_numpy(np.eye(1000)[np.random.choice(1000, batch_size)]), axis=1).to(rank)
g = torch.cuda.CUDAGraph()
optimizer.zero_grad(set_to_none=True)
with torch.cuda.graph(g):
capture_y_pred = model(capture_input)
capture_loss = criterion(capture_y_pred, capture_target)
capture_loss.backward()
optimizer.step()
print("RECORDED")
for i in range(20):
inputs, label = next(data_iter)
capture_input.copy_(inputs)
capture_target.copy_(label)
g.replay()
optimizer.step()
print("DATASET DONE")
But I get the following error:
RuntimeError: CUDA error: operation would make the legacy stream depend on a capturing blocking stream
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Is your model working on a single GPU? I see that e.g. the torch.cuda.current_stream().wait_stream(s) is missing before creating the torch.cuda.CUDAGraph() object.
Also, could you add the missing pieces to your code to make it executable (i.e. the actual shape of the inputs etc.)?