ReLU activation function causes NaN gradients

I’m trying to implement the following NN:

class MyModel(torch.nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.conv0 = torch.nn.Conv1d(24,512,5)
        self.conv1 = torch.nn.Conv1d(512,512,3,dilation=2)
        self.conv2 = torch.nn.Conv1d(512,512,3,dilation=3)
        self.conv3 = torch.nn.Conv1d(512,512,1)
        self.conv4 = torch.nn.Conv1d(512,1500,1)
        self.lin1 = torch.nn.Linear(3000,512)
        self.lin2 = torch.nn.Linear(512,512)
        self.lin3 = torch.nn.Linear(512,1951)
        self.norm1 = torch.nn.BatchNorm1d(512)
        self.norm2 = torch.nn.BatchNorm1d(512)
        self.norm3 = torch.nn.BatchNorm1d(512)
        self.norm4 = torch.nn.BatchNorm1d(512)
        self.norm5 = torch.nn.BatchNorm1d(1500)
        self.norm7 = torch.nn.BatchNorm1d(512)
        self.rel = torch.nn.ReLU()

    def forward(self,x):
        l1out = self.rel(self.conv0(x))
        l1norm = self.norm1(l1out)
        l2out = self.rel(self.conv1(l1norm))
        l2norm = self.norm2(l2out)
        l3out = self.rel(self.conv2(l2norm))
        l3norm = self.norm3(l3out)
        l4out = self.rel(self.conv3(l3norm))
        l4norm = self.norm4(l4out)
        l5out = self.rel(self.conv4(l4norm))
        l5norm = self.norm5(l5out)
        mean = torch.mean(l5norm,dim=2)
        std = torch.std(l5norm,dim=2)
        l6inp = torch.cat([mean,std],dim=1)
        l6out = self.rel(self.lin1(l6inp))
        l7out = self.rel(self.lin2(l6out))
        l7norm = self.norm7(l7out)
        l8out = torch.sigmoid(self.lin3(l7norm))
        result = torch.nn.functional.softmax(l8out,dim=1)
        return result
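
For context, a minimal sketch of how the model is exercised (the batch size and sequence length below are just placeholders, not my real data):

import torch

model = MyModel()
x = torch.randn(8, 24, 200)   # (batch, channels=24, sequence length)
out = model(x)                # -> shape (8, 1951)
print(out.shape)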

But the gradients of the convolution layers computed by autograd contain NaNs, and when I used sigmoid instead of ReLU everything was fine. Can somebody explain the reason for this problem?
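
For reference, a minimal sketch of how the NaN can be traced back to the operation and parameters that produce it (the loss here is a placeholder, not my real training objective):

import torch

torch.autograd.set_detect_anomaly(True)   # backward error will name the forward op that produced the NaN

model = MyModel()
x = torch.randn(8, 24, 200)               # placeholder input
out = model(x)
loss = out.sum()                          # placeholder loss
loss.backward()

for name, p in model.named_parameters():
    if p.grad is not None and torch.isnan(p.grad).any():
        print(name, "has NaN gradient")

With anomaly detection enabled, the backward pass raises an error pointing at the forward operation whose gradient first became NaN, and the loop above shows which parameters are affected.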

Hello @eprokopalo, I have encountered the same problem: https://discuss.pytorch.org/t/batchnorm-and-relu/76563. I am using Conv1d --> ReLU --> BatchNorm, which gives me NaN. Have you found any reason or solution for this?
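
For clarity, a minimal sketch of the layer ordering I mean (the channel counts and kernel size are placeholders):

import torch

block = torch.nn.Sequential(
    torch.nn.Conv1d(24, 64, 3),   # Conv1d
    torch.nn.ReLU(),              # ReLU
    torch.nn.BatchNorm1d(64),     # BatchNorm after the activation
)
y = block(torch.randn(8, 24, 100))   # -> (8, 64, 98)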