Simple tensor model produces NaN values after a few backprop steps

When I run the code below, the last line, print(test_output[0]), shows that the model returns all NaN values. Also, when I add a print statement at the start of the forward method to print out a weight, print("preforward w: ", self.w[0][0][0]), the fourth iteration throws RuntimeError: Overflow when unpacking long.

My main problem is that I don't understand where the NaN values come from. Any ideas?
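
To narrow down where things go wrong, a check along these lines could be added right after optimizer.step() in the training loop (just a diagnostic sketch, not part of the failing code; torch.isnan and named_parameters are standard PyTorch, and i is the loop index from the code below):

# Diagnostic sketch: scan every parameter after each update so the
# first iteration that produces NaNs can be identified.
for name, p in model.named_parameters():
    if torch.isnan(p).any():
        print("step", i, "- NaN in parameter", name)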

import torch
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
from torchvision import datasets


torch.manual_seed(1)

class MnistModel(nn.Module):
    def __init__(self, w, b):
        super(MnistModel, self).__init__()
        self.w = torch.nn.Parameter(w)   # shape (1, 784)
        self.b = torch.nn.Parameter(b)   # shape (784, 1); never used in forward
        self.w2 = torch.nn.Parameter(torch.rand(1000, 784, 10))  # one (784, 10) matrix per sample
        self.w3 = torch.nn.Parameter(torch.rand(1000, 784, 1))   # one (784, 1) matrix per sample

    def forward(self, x):
        # x has shape (1000, 784, 1)
#         print("preforward w: ", self.w[0][0][0])
        layer = torch.mul(x, self.w)       # broadcasts to (1000, 784, 784)
        layer = torch.bmm(layer, self.w2)  # -> (1000, 784, 10)
        layer = layer.view(1000, 10, 784)
        layer = torch.bmm(layer, self.w3)  # -> (1000, 10, 1)
        output = layer.view(1000, 10)
        return output

batch_size = 1000
classes = 10
train_data = datasets.MNIST('data', train=True, 
                            download=True, 
                            transform=transforms.ToTensor())
train_loader = torch.utils.data.DataLoader(train_data, 
                                           batch_size=batch_size, 
                                           shuffle=False)
test_data = datasets.MNIST('data', train=False, transform=transforms.ToTensor())
test_loader = torch.utils.data.DataLoader(test_data,batch_size=batch_size)    

w = torch.randn(1, 784, requires_grad=True)  # requires_grad is redundant once wrapped in nn.Parameter
b = torch.randn(784, 1, requires_grad=True)
learning_rate = 0.001

model = MnistModel(w, b)


criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
rows = np.arange(batch_size)  # row indices for building one-hot targets
for i, (raw_data, raw_target) in enumerate(train_loader):
    data = raw_data.view((batch_size, 784, 1))
    logits = model(data)

    # Build one-hot encoded targets for MSELoss
    onehot = np.zeros((batch_size, classes), dtype=np.float32)
    onehot[rows, raw_target.numpy()] = 1.0
    target = torch.from_numpy(onehot)
    loss = criterion(logits, target)
    
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if i == 3:  # train on the first four batches only
        break
        

for i, (raw_data, raw_target) in enumerate(test_loader):
    data = raw_data.view((batch_size, 784, 1))
    test_output = model(data)
    print(test_output[0])  # this is the line that prints all nan
    break  # only look at the first test batch
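
If it helps to see the numbers before they turn into NaN, this is the kind of logging that could go right after loss.backward() in the training loop above (a sketch only; the variable names match that loop):

print("step", i, "loss:", loss.item())
for name, p in model.named_parameters():
    if p.grad is not None:
        print("  max |grad| of", name, ":", p.grad.abs().max().item())

The p.grad is not None guard is there because gradients are only populated after the first backward call.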