Is it okay to reuse activation function modules in the network architecture?

Some layers might not yield deterministic results, since their CUDA kernels use atomicAdd.
You could try to remove these layers from your model and recheck the results, or use this simple code snippet to verify that both ReLU approaches (the reused nn.ReLU module and the functional F.relu call) yield the same output and gradients:

import torch
import torch.nn as nn
import torch.nn.functional as F

# make cuDNN deterministic and seed the RNG so both runs are comparable
torch.backends.cudnn.deterministic = True
torch.manual_seed(2809)

# small CNN that applies ReLU either via a single reused nn.ReLU module or via F.relu
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 3, 1, 1)
        self.conv2 = nn.Conv2d(6, 12, 3, 1, 1)
        self.linear = nn.Linear(12*24*24, 100)
        self.act = nn.ReLU()

    def forward(self, x, use_act):
        if use_act:
            # apply the single, reused nn.ReLU module after each conv
            out = self.act(self.conv1(x))
            out = self.act(self.conv2(out))
        else:
            # apply the functional API instead
            out = F.relu(self.conv1(x))
            out = F.relu(self.conv2(out))

        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

model = MyModel().cuda()
x = torch.randn(64, 3, 24, 24).cuda()

# forward/backward pass using the reused nn.ReLU module
out = model(x, use_act=True)
out.mean().backward()

# store the reference output and gradients
out_reference = out.clone()
grads_ref = []
for p in model.parameters():
    grads_ref.append(p.grad.clone())


# reset the gradients and repeat the forward/backward pass with F.relu
model.zero_grad()
out = model(x, use_act=False)
out.mean().backward()

# collect the gradients from the functional run
grads = []
for p in model.parameters():
    grads.append(p.grad.clone())

# compare outputs and gradients of both approaches
print('output allclose: {}, max abs diff: {}'.format(
    torch.allclose(out_reference, out), (out_reference - out).abs().max()))
print('grads allclose: {}, max abs diff: {}'.format(
    all([torch.allclose(gr, g) for gr, g in zip(grads_ref, grads)]),
    max([(gr - g).abs().max() for gr, g in zip(grads_ref, grads)])))

> output allclose: True, max abs diff: 0.0
> grads allclose: True, max abs diff: 0.0
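
To the original question: yes, reusing an activation module such as nn.ReLU is fine, since it holds no parameters or buffers and therefore no state that could leak between layers. As a minimal sketch (assuming a PyTorch version that provides torch.use_deterministic_algorithms), you can check that the module is stateless and additionally ask PyTorch to raise an error whenever an op without a deterministic implementation (e.g. one relying on atomicAdd) is executed:

import torch
import torch.nn as nn

act = nn.ReLU()
# a stateless module: no parameters and no buffers, so reuse is safe
print(list(act.parameters()))  # []
print(list(act.buffers()))     # []

# optional: error out on non-deterministic kernels such as atomicAdd-based ones
# (available in newer PyTorch releases; adjust if your version lacks it)
torch.use_deterministic_algorithms(True)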