All of the weights are nan

lifeblack · September 25, 2020, 10:09am

Hi, I have a very simple linear network:

class Net(nn.Module):
    """Network consisting of a single fully connected layer.

    The layer takes ``int(floor(hidden * measurement_rate))`` input features
    and produces ``hidden`` output features.
    """

    def __init__(self, measurement_rate, hidden=block_size**2):
        super().__init__()
        # Input width is the number of measurements kept at this rate.
        in_features = int(np.floor(hidden * measurement_rate))
        self.fc = nn.Linear(in_features, hidden)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.fc(x)
def weights_init(m):
    """Re-draw the weights of ``nn.Linear`` modules from a normal distribution.

    The weight tensor is filled in place with samples of mean 0.0 and
    standard deviation 0.01. Modules of any other type are left untouched,
    and the bias of a Linear layer is not modified.
    """
    if not isinstance(m, nn.Linear):
        return
    linear_weights = m.weight.data
    linear_weights.normal_(mean=0.0, std=0.01)

# Build the model, re-initialise its Linear weights, and move it to the device.
model = Net(measurement_rate, block_size**2)
model.apply(weights_init)
model.to(device)

# SGD with momentum. StepLR multiplies the learning rate by 0.5 every 200000
# calls to scheduler.step(), and the scheduler is stepped once per epoch below.
# NOTE(review): if the loss grows from iteration to iteration the run is
# diverging; a smaller learning_rate is the first thing to try.
learning_rate = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
scheduler = StepLR(optimizer, step_size=200000, gamma=0.5)
criterion = nn.MSELoss()

start_time = time.time()
n_epochs = 1000000
print_every = 500
# np.Inf was removed in NumPy 2.0; the lowercase spelling works everywhere.
test_loss_min = np.inf
for epoch in range(n_epochs):
    print('Epoch:', epoch, 'LR:', scheduler.get_last_lr())
    train_loss = 0
    model.train()
    # Each batch is a pair: the first element is fed to the model, the second
    # is the regression target (the loader names them label and data).
    for iteration, (label, data) in enumerate(Train_Loader):
        optimizer.zero_grad()
        output = model(label.to(device))
        loss = criterion(output, data.to(device))
        # Stop before NaN/Inf gradients are applied and before the checkpoint
        # on disk is overwritten with unusable weights.
        if not torch.isfinite(loss):
            raise RuntimeError(
                'Non-finite training loss ({}) at epoch {}, iteration {}. '
                'Training has likely diverged or the data contains non-finite '
                'values; try lowering the learning rate.'.format(
                    loss.item(), epoch, iteration))
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by batch size to get a per-sample average.
        train_loss += loss.item() * data.shape[0]
        if iteration % print_every == 0:
            # Periodic evaluation on the test set; remember the current mode
            # so it can be restored afterwards.
            is_training = model.training
            test_loss = 0
            model.eval()
            # No gradients are needed for evaluation, so skip graph building.
            # Separate names keep the training batch variables intact.
            with torch.no_grad():
                for test_label, test_data in Test_Loader:
                    test_output = model(test_label.to(device))
                    batch_loss = criterion(test_output, test_data.to(device))
                    test_loss += batch_loss.item() * test_data.shape[0]
            test_loss = test_loss / len(Test_Loader.sampler)
            if test_loss <= test_loss_min:
                print('\t Test loss decreased ({:.6f}-->{:.6f}). '.format(test_loss_min, test_loss))
                test_loss_min = test_loss
            model.train(mode=is_training)
    scheduler.step()
    train_loss = train_loss / len(Train_Loader.sampler)
    print(len(Train_Loader.sampler))
    print('weight', copy.deepcopy(model.fc.weight.data))
    print('Epoch: {} \tTraining loss:{:.6f}. Saving Model...'.format(epoch + 1, train_loss))
    # The checkpoint is overwritten at the end of every epoch.
    torch.save(model.state_dict(), ('/content/drive/My Drive/dataset/model_step1.pt'))
end_time = time.time()
print('Duration: {}'.format(end_time - start_time))

I printed the weights, and all of them are NaN.
The loss is also NaN.
How can I fix this problem?

ptrblck · September 26, 2020, 8:32am

Are you seeing an increasing loss during your training?
If so, your training is diverging and the model parameters might overflow after a certain number of iterations.
To avoid this, you could try, for example, lowering the learning rate.