Understanding how torch.nn.Module works

I was reading through the tutorials but I feel like I still don’t quite understand how torch.nn.Module works. Again, as the simplest possible toy example for exploring the API, I tried to implement OLS regression.

Essentially, I want to reproduce the results I get when I do it “manually”:

from torch.autograd import Variable
import torch


x = Variable(torch.Tensor([[1.0, 1.0], 
                           [1.0, 2.1], 
                           [1.0, 3.6], 
                           [1.0, 4.2], 
                           [1.0, 6.0], 
                           [1.0, 7.0]]))
y = Variable(torch.Tensor([1.0, 2.1, 3.6, 4.2, 6.0, 7.0]))
weights = Variable(torch.zeros(2, 1), requires_grad=True)


for i in range(5000):

    net_input = x.mm(weights)
    loss = torch.mean((net_input - y)**2)
    loss.backward()
    weights.data.add_(-0.0001 * weights.grad.data)
    
    if loss.data[0] < 1e-3:
        break

print('n_iter', i)
print(loss.data[0])

Output:

n_iter 1188
0.0004487129335757345

Now, running the following

import torch.nn.functional as F

class Model(torch.nn.Module):
    
    def __init__(self):
        super(Model, self).__init__()
        self.weights = Variable(torch.zeros(2, 1), 
                                requires_grad=True)
    
    def forward(self, x):
        net_input = x.mm(self.weights)
        return net_input
        
model = Model()
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

raises an error:

    ---------------------------------------------------------------------------
    IndexError                                Traceback (most recent call last)
    <ipython-input-258-3bcb3a8408d2> in <module>()
         15 model = Model()
         16 criterion = torch.nn.MSELoss()
    ---> 17 optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

    /Users/Sebastian/miniconda3/envs/pytorch/lib/python3.5/site-packages/torch/optim/sgd.py in __init__(self, params, lr, momentum, dampening, weight_decay)
         24         defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
         25                         weight_decay=weight_decay)
    ---> 26         super(SGD, self).__init__(params, defaults)
         27 
         28     def step(self, closure=None):

    /Users/Sebastian/miniconda3/envs/pytorch/lib/python3.5/site-packages/torch/optim/optimizer.py in __init__(self, params, defaults)
         25         self.state = defaultdict(dict)
         26         self.param_groups = list(params)
    ---> 27         if not isinstance(self.param_groups[0], dict):
         28             self.param_groups = [{'params': self.param_groups}]
         29 

    IndexError: list index out of range

So, would I need a layer in forward to make it work?

I tried that:

class Model(torch.nn.Module):
    
    def __init__(self):
        super(Model, self).__init__()
        self.weights = Variable(torch.zeros(2, 1), 
                                requires_grad=True)
        self.fc = torch.nn.Linear(2, 1)
        
    def forward(self, x):
        return x
        
model = Model()
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

for i in range(5000):
    optimizer.zero_grad()
    outputs = model(x)
    
    loss = criterion(outputs, y)
    loss.backward()        

    optimizer.step()
    
print(loss.data[0])

but now, I am getting an error about the dimensions of the input:

    ---------------------------------------------------------------------------
    RuntimeError                              Traceback (most recent call last)
    <ipython-input-259-c6bb483f3953> in <module>()
         28     outputs = model(x)
         29 
    ---> 30     loss = criterion(outputs, y)
         31     loss.backward()
         32 

    /Users/Sebastian/miniconda3/envs/pytorch/lib/python3.5/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
        208 
        209     def __call__(self, *input, **kwargs):
    --> 210         result = self.forward(*input, **kwargs)
        211         for hook in self._forward_hooks.values():
        212             hook_result = hook(self, input, result)

    /Users/Sebastian/miniconda3/envs/pytorch/lib/python3.5/site-packages/torch/nn/modules/loss.py in forward(self, input, target)
         21         _assert_no_grad(target)
         22         backend_fn = getattr(self._backend, type(self).__name__)
    ---> 23         return backend_fn(self.size_average)(input, target)
         24 
         25 

    /Users/Sebastian/miniconda3/envs/pytorch/lib/python3.5/site-packages/torch/nn/_functions/thnn/auto.py in forward(self, input, target)
         39         output = input.new(1)
         40         getattr(self._backend, update_output.name)(self._backend.library_state, input, target,
    ---> 41             output, *self.additional_args)
         42         return output
         43 

    RuntimeError: input and target have different number of elements: input[6 x 2] has 12 elements, while target[6] has 6 elements at /Users/soumith/anaconda/conda-bld/pytorch-0.1.6_1484801351127/work/torch/lib/THNN/generic/MSECriterion.c:12

So, I was looking into torch.nn.Linear(2, 1), and it seems to just return the input array in forward. However, the documentation says it "Applies a linear transformation to the incoming data: y = Ax + b".

It would be great if someone could help me a bit with regard to what’s going on here and how to use nn.Module correctly.

Best,
Sebastian


Both of your examples have small errors:


When you want to save a Variable as a parameter of a module, you should use the nn.Parameter class. It is a very simple subclass of Variable, but it has a special behaviour when assigned as a module attribute: it gets added to the parameter list and is returned when you call model.parameters(). Variables aren’t added automatically; we thought of many cases where you’d want to cache some Variable in your model without adding it to the parameters (e.g. when writing a recurrent network you might want to save the last hidden state so you don’t have to pass it over and over).

Also, Parameters are convenient because they have requires_grad=True by default.

If you inspect list(model.parameters()) you’ll see that it’s empty. I’ll improve the error message though; it’s hard to trace what’s wrong.
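For illustration, here is a minimal sketch of the difference when each is assigned as a module attribute (the VarModel and ParamModel names are made up for this example):

import torch
from torch.autograd import Variable
from torch.nn import Parameter

class VarModel(torch.nn.Module):
    def __init__(self):
        super(VarModel, self).__init__()
        # a plain Variable assigned as an attribute is NOT registered as a parameter
        self.weights = Variable(torch.zeros(2, 1), requires_grad=True)

class ParamModel(torch.nn.Module):
    def __init__(self):
        super(ParamModel, self).__init__()
        # an nn.Parameter is added to the parameter list automatically
        self.weights = Parameter(torch.zeros(2, 1))

print(list(VarModel().parameters()))    # [] -> this is why the optimizer raises IndexError
print(list(ParamModel().parameters()))  # [Parameter containing the 2x1 weight]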


You have a bug in forward - your model always just returns the input. Try this:

class Model(torch.nn.Module):
    
    def __init__(self):
        super(Model, self).__init__()
        self.fc = torch.nn.Linear(2, 1)
        
    def forward(self, x):
        return self.fc(x) # it was just x there

Thanks a lot, for some reason I didn’t see Parameter in the API docs and didn’t know it existed. It makes sense though; I feel like I am slowly starting to understand how things work in PyTorch :stuck_out_tongue:

Huh, I think we’re missing the docs on Parameters; I’ll note that and make sure they’re added soon. Sorry for the confusion.


Hi, thanks for clarifying the difference between Variable and Parameter in this thread.

It might be slightly off topic, but I have a question about the code used here.

The very first code written by rasbt, which implemented OLS regression manually using only Variable, looks perfectly fine to me, and I got exactly the same output. However, the loss is not monotonically decreasing, for some unknown reason. Am I missing something?

By the way, after replacing the Variable used in rasbt’s second code with a Parameter, the loss decreases monotonically as usual.

I put the result of the code execution under this link (Jupyter Notebook posted in the gist).


Yeah, same issue here. The first example has some issues with the minimization:

from torch.autograd import Variable
import torch
import matplotlib.pyplot as plt


x = Variable(torch.Tensor([[1.0, 1.0], 
                           [1.0, 2.1], 
                           [1.0, 3.6], 
                           [1.0, 4.2], 
                           [1.0, 6.0], 
                           [1.0, 7.0]]))
y = Variable(torch.Tensor([1.0, 2.1, 3.6, 4.2, 6.0, 7.0]))
weights = Variable(torch.zeros(2, 1), requires_grad=True)

loss1 = []

for i in range(5000):

    net_input = x.mm(weights)
    loss = torch.mean((net_input - y)**2)
    loss.backward()
    weights.data.add_(-0.0001 * weights.grad.data)
    
    loss1.append(loss.data[0])

print('n_iter', i)
print(loss.data[0])

plt.plot(range(5000), loss1)

The 2nd example works well, though:

import torch.nn.functional as F
from torch.nn import Parameter

class Model(torch.nn.Module):
    
    def __init__(self):
        super(Model, self).__init__()
        self.weights = Parameter(torch.zeros(2, 1), 
                                 requires_grad=True)
    
    def forward(self, x):
        net_input = x.mm(self.weights)
        return net_input
        
model = Model()
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
loss2 = []

for i in range(5000):
    optimizer.zero_grad()
    outputs = model(x)
    
    loss = criterion(outputs, y)
    loss2.append(loss.data[0])
    loss.backward()        

    optimizer.step()
    
plt.plot(range(5000), loss2)

After standardizing the data in ex1 and lowering the learning rate further, it would somewhat work, though:

from torch.autograd import Variable
import torch


x = torch.Tensor([[1.0, 1.0], 
                  [1.0, 2.1], 
                  [1.0, 3.6], 
                  [1.0, 4.2], 
                  [1.0, 6.0], 
                  [1.0, 7.0]])

x = (x - x.mean()) / x.max()
x = Variable(x)

y = torch.Tensor([1.0, 2.1, 3.6, 4.2, 6.0, 7.0])
y = (y - y.mean()) / y.max()
y = Variable(y)

weights = Variable(torch.zeros(2, 1), requires_grad=True)

loss1 = []

for i in range(5000):

    net_input = x.mm(weights)
    loss = torch.mean((net_input - y)**2)
    loss.backward()
    weights.data.add_(-0.00000001 * weights.grad.data)
    
    loss1.append(loss.data[0])

print('n_iter', i)
print(loss.data[0])

plt.plot(range(5000), loss1)

This would maybe suggest that there’s some more optimization going on in the .nn modules (unless I have a mistake in the ex1 implementation, or there’s a bug somewhere).


There’s no magic in the Module implementation; it’s only a simple container and is not involved in the optimization process. The problem with your module-less examples is that you never zero the weight gradient with weight.grad.data.zero_(), while in the module example you call optimizer.zero_grad(), which does it for you. Once I added that, they both converge to the solution.
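For reference, a minimal sketch of the module-less loop from the first post with the missing gradient reset added (it reuses the x, y, and weights defined there):

for i in range(5000):
    net_input = x.mm(weights)
    loss = torch.mean((net_input - y)**2)
    loss.backward()
    weights.data.add_(-0.0001 * weights.grad.data)
    weights.grad.data.zero_()  # clear the accumulated gradient before the next iteration
    if loss.data[0] < 1e-3:
        break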


Ah thanks! I knew I forgot something! It works perfectly fine now :slight_smile:


Good questions - I’ve been trying to grok it too!

Great! Now the two versions generate exactly the same output. Thanks, I learned a lot from this example :blush:


Are you planning to collect these examples in a nice repo? :grimacing:
I think they are awesome study material!
One question on the examples: don’t we need to randomly initialise the weight matrix in order to break symmetry? :confused: -> Oh, well, there are two input neurons and just one output neuron, so there is no symmetry to break, ha.

Oh, I was actually just trying to figure stuff out but yeah, maybe one could collect a nicer version of these examples :slight_smile:

Regarding the symmetry: yeah, in this case zero weights wouldn’t matter, like you said.


Hmm, well, why do you have to zero them out? You are just computing new gradients, so shouldn’t the old ones just be overwritten?


No, they’re accumulated.

@apaszke, by “accumulated” do you mean, weight_new_grad = weight_old_grad + weight_grad_on_this_iteration?


Yes, that’s what I meant.
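A tiny sketch of that accumulation behaviour, using the same Variable API as the rest of this thread:

w = Variable(torch.ones(1), requires_grad=True)

(2 * w).backward()
print(w.grad.data[0])  # 2.0

(2 * w).backward()
print(w.grad.data[0])  # 4.0 -- the new gradient was added to the old one

w.grad.data.zero_()
(2 * w).backward()
print(w.grad.data[0])  # 2.0 again after zeroing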


Am I right to say that, to be consistent with the definition of x above, we’d use self.fc = torch.nn.Linear(2, 1, bias=False)?

It’s hard to follow what your question is because the title is vague and the question details are too long.

Hi, I would like to come back to this example and understand why there is a difference between the two loss functions:

  • loss1 = torch.mean((y - y_pred)**2.0)
  • loss2 = mse_loss(y_pred, y), where mse_loss = nn.MSELoss()

Here is my complete code:

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.autograd import Variable

x = Variable(torch.FloatTensor([[1.0, 1.0], 
                                 [1.0, 2.1], 
                                 [1.0, 3.6], 
                                 [1.0, 4.2], 
                                 [1.0, 6.0], 
                                 [1.0, 7.0]]))
y = Variable(torch.FloatTensor([1.0, 2.1, 3.6, 4.2, 6.0, 7.0]))

mse_loss = nn.MSELoss()
weights = Variable(torch.zeros((2, 1)).float(), requires_grad=True)

n = 500
loss1 = []
loss2 = []
weights_grads_history = np.zeros((n, 2))
weights_history = np.zeros((n, 2))
learning_rate = 0.0001

for i in range(n):

    y_pred = x.mm(weights)
    loss = torch.mean((y - y_pred)**2.0)
    loss_ = mse_loss(y_pred, y)    
    loss1.append(loss.data[0])
    loss2.append(loss_.data[0])
    # Compute gradients
    loss.backward()
    
    # Update parameters
    weights_grads_history[i, :] = weights.grad.data.numpy()[:, 0]
    weights_history[i, :] = weights.data.numpy()[:, 0]
    weights.data.sub_(weights.grad.data * learning_rate)

    # You need to clear the existing gradients though, else gradients will be accumulated to existing gradients         
    weights.grad.data.zero_()

    
print("n_iter", i)
print(loss1[-1], loss2[-1])
print("y_pred: ", y_pred)
print("Weights : ", weights)

plt.figure(figsize=(12, 4))
plt.subplot(131)
plt.plot(range(n), loss1, label='loss1')
plt.plot(range(n), loss2, label='loss2')
_ = plt.legend()
plt.subplot(132)
plt.plot(range(n), weights_grads_history[:, 0], label='W_grad_1')
plt.plot(range(n), weights_grads_history[:, 1], label='W_grad_2')
plt.legend()
plt.subplot(133)
plt.plot(range(n), weights_history[:, 0], label='W_1')
plt.plot(range(n), weights_history[:, 1], label='W_2')
_ = plt.legend()

Here is the resulting plot:

There is an apparent difference between the two loss functions, and nn.MSELoss is lower than the manually computed one.
Could you hint at why this happens in my code?

Thanks


Hi,

your code has a small issue regarding the shapes of y_pred and y.
When you check the sizes of these two Variables, you will notice that y_pred has a size of torch.Size([6, 1]), while y has a size of torch.Size([6]).
Subtracting these Variables results in a new Variable of size torch.Size([6, 6]) due to broadcasting.
See the Release notes for more information.

Your code should work when you change the following line:

loss = torch.mean((y - y_pred.view(-1))**2.0)
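
A quick way to see the shapes, assuming the x, y, and y_pred from the code above:

print(y_pred.size())                  # torch.Size([6, 1])
print(y.size())                       # torch.Size([6])
print((y - y_pred).size())            # torch.Size([6, 6]) -- broadcast, averaged over 36 elements
print((y - y_pred.view(-1)).size())   # torch.Size([6])    -- what we actually want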