Weight constraining on DataParallel gives NaN loss

I am training a convolutional network on CIFAR-10 with 2 GPUs, using nn.DataParallel to parallelize the model.

The model is:

import torch
import torch.nn as nn

class CNNModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        #Conv 1
        self.cnn1 = nn.Conv2d(in_channels=3,out_channels=16,kernel_size=3,stride=1,padding=1)
        self.activation = nn.ELU()
        
        #maxpool1
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)
        
        #Conv 2
        self.cnn2 = nn.Conv2d(in_channels=16,out_channels=32,kernel_size=3,stride=1,padding=1)

        #Conv 3
        self.cnn3 = nn.Conv2d(in_channels=32,out_channels=64,kernel_size=3,stride=1,padding=1)
        self.cnn4 = nn.Conv2d(in_channels=64,out_channels=128,kernel_size=3,stride=1,padding=1)
        self.cnn5 = nn.Conv2d(in_channels=128,out_channels=128,kernel_size=3,stride=1,padding=1)
        self.cnn6 = nn.Conv2d(in_channels=128,out_channels=128,kernel_size=3,stride=1,padding=1)

        
        #Maxpool 2
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)
        
        # 8 = (32/2)/2: two 2x2 max-pools shrink the 32x32 input to 8x8
        self.fc1 = nn.Linear(128*8*8,10)
        
    def forward(self,x):
        # x is a CIFAR-10 batch of size (N, 3, 32, 32);
        # it is flattened to (N, 128*8*8) before fc1
        out = self.activation(self.cnn1(x))
        out = self.maxpool1(out)

        out = self.activation(self.cnn2(out))

        out = self.activation(self.cnn3(out))
        out = self.activation(self.cnn4(out))
        out = self.activation(self.cnn5(out))
        out = self.activation(self.cnn6(out))
        
        out = self.maxpool2(out)
        out = out.view(out.shape[0],-1)
        
        out = self.fc1(out)
        return out
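
As a sanity check on the architecture, a dummy CIFAR-10 sized batch confirms the flattened feature size that fc1 expects (a minimal sketch; the batch size of 4 is arbitrary):

model = CNNModel()
x = torch.randn(4, 3, 32, 32)   # (batch, channels, height, width)
out = model(x)
print(out.shape)                # expected: torch.Size([4, 10])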

The weight constraint I am using is:

class weightConstraint(object):
    def __call__(self, module):
        # Project any negative weights to zero
        if hasattr(module, 'weight'):
            print("Entered")
            w = module.weight.data
            w[w < 0] = 0
            module.weight.data = w
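
An equivalent and slightly safer formulation (a sketch, assuming the goal is simply non-negative weights; the class name here is made up) clamps in place under torch.no_grad() instead of assigning through .data:

class NonNegativeConstraint(object):
    def __call__(self, module):
        if hasattr(module, 'weight'):
            # clamp_ modifies the tensor in place; no_grad keeps autograd out of it
            with torch.no_grad():
                module.weight.clamp_(min=0)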

I tried two techniques to apply this constraint.
Technique 1:

model = CNNModel()
constraint = weightConstraint()
for key, val in model._modules.items():
    if hasattr(val, 'weight'):
        print(key)
        val.apply(constraint)
model = nn.DataParallel(model).to(device)

Technique 2:

model = CNNModel()
model = nn.DataParallel(model).to(device)

constraint = weightConstraint()
# DataParallel exposes the wrapped model as the .module attribute
for key, val in model.module._modules.items():
    if hasattr(val, 'weight'):
        print(key)
        val.apply(constraint)
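
Note that in both techniques the constraint is applied only once, before training. If the intent is to keep the weights non-negative throughout training, the projection would typically be re-applied after every optimizer step, roughly like this (a sketch; criterion, optimizer, and train_loader are assumed to exist):

for images, labels in train_loader:
    images, labels = images.to(device), labels.to(device)
    optimizer.zero_grad()
    loss = criterion(model(images), labels)
    loss.backward()
    optimizer.step()
    # Re-project the weights after each update; apply() recurses into all submodules
    model.apply(constraint)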

Training in both cases gives a NaN loss (and an accuracy of 0.10, i.e. random guessing over 10 classes) from the very first epoch.
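
To narrow down where the NaNs first appear, PyTorch's built-in anomaly detection plus an explicit isnan check can help (a sketch; criterion, images, and labels are placeholders):

torch.autograd.set_detect_anomaly(True)  # reports the op that produced the first NaN/Inf in backward

loss = criterion(model(images), labels)
if torch.isnan(loss).any():
    for name, p in model.named_parameters():
        if torch.isnan(p).any():
            print(name, "contains NaN")
loss.backward()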

It is hard to tell what is going on here. How does your training behave on a single GPU, without the DataParallel wrapping?

It seems to be working well without the constraint.

Anyway, the code now seems to work sometimes, and gives NaN losses the rest of the time.

I guess it might just be a poor model to start with.
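
One way to test that guess (a sketch) is to measure how much of the network the projection zeroes out; since roughly half of each layer's randomly initialized weights are negative, zeroing them all is a drastic change:

for name, p in model.named_parameters():
    if 'weight' in name:
        frac_zero = (p == 0).float().mean().item()
        print(f"{name}: {frac_zero:.2%} of weights are zero")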