Some layers might not yield deterministic results due to their use of atomicAdd.
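If you want PyTorch to point out such operations for you, a rough sketch like the one below might help (assuming a reasonably recent PyTorch release; torch.use_deterministic_algorithms and its warn_only argument are not available in older versions):

import torch

# Warn (instead of raising an error) whenever an op without a deterministic
# implementation, e.g. one relying on atomicAdd in its backward, is used.
torch.use_deterministic_algorithms(True, warn_only=True)

# Use deterministic cuDNN kernels and disable the benchmark autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False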
You could try to remove these layers from your model and recheck the results, or use this simple code snippet to verify that both ReLU approaches yield the same result:
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.backends.cudnn.deterministic = True
torch.manual_seed(2809)


class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 3, 1, 1)
        self.conv2 = nn.Conv2d(6, 12, 3, 1, 1)
        self.linear = nn.Linear(12*24*24, 100)
        self.act = nn.ReLU()

    def forward(self, x, use_act):
        if use_act:
            # Module-based activation
            out = self.act(self.conv1(x))
            out = self.act(self.conv2(out))
        else:
            # Functional activation
            out = F.relu(self.conv1(x))
            out = F.relu(self.conv2(out))
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


model = MyModel().cuda()
x = torch.randn(64, 3, 24, 24).cuda()

# Forward/backward pass using nn.ReLU as the reference
out = model(x, use_act=True)
out.mean().backward()
out_reference = out.clone()
grads_ref = []
for p in model.parameters():
    grads_ref.append(p.grad.clone())

# Forward/backward pass using F.relu
model.zero_grad()
out = model(x, use_act=False)
out.mean().backward()
grads = []
for p in model.parameters():
    grads.append(p.grad.clone())

# Compare outputs and gradients
print('output allclose: {}, max abs diff: {}'.format(
    torch.allclose(out_reference, out), (out_reference - out).abs().max()))
print('grads allclose: {}, max abs diff: {}'.format(
    all([torch.allclose(gr, g) for gr, g in zip(grads_ref, grads)]),
    max([(gr - g).abs().max() for gr, g in zip(grads_ref, grads)])))
> output allclose: True, max abs diff: 0.0
> grads allclose: True, max abs diff: 0.0
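If you run this on a setup where the results differ slightly, a comparison with explicit tolerances can be more informative than the default allclose settings; a small sketch (the tolerance values here are arbitrary and just for illustration):

# Compare gradients with explicit (arbitrary) tolerances and report the
# maximum absolute difference for any mismatching parameter.
for gr, g in zip(grads_ref, grads):
    if not torch.allclose(gr, g, rtol=1e-5, atol=1e-7):
        print('grad mismatch, max abs diff: {}'.format((gr - g).abs().max()))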