Digit recognition: manually writing all methods

Hi all, I’m trying to replicate the same neural network as here by manually writing all the required functions along with their derivatives for backpropagation (Linear, ReLU and CrossEntropyLoss). Forward propagation seems to be fine: if I take the weights and biases of linear1, linear2 and linear3 and pass them through my forward method, I get the same loss for a batch of images as the network built with nn.Sequential, which uses the PyTorch-defined layers. However, when I call the backward method I wrote, the matrix sizes don’t match for the multiplications that should produce the weight/bias gradients. I’ve tried transposing some of the matrices in various ways, but I can never make all of the shapes line up. Could anyone advise me on how to modify backward so that the weight/bias gradients are computed correctly? :smiley:

Sorry if it isn’t in the right category!
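
For reference, the network I’m comparing against looks roughly like this; the exact model is behind the link above, so the layer sizes (784 -> 128 -> 64 -> 10) are my assumption, taken from the commented-out weight shapes further down:

# assumed reference model; layer sizes inferred from the weight shapes below
model = nn.Sequential(nn.Linear(784, 128),
                      nn.ReLU(),
                      nn.Linear(128, 64),
                      nn.ReLU(),
                      nn.Linear(64, 10))
criterion = nn.CrossEntropyLoss()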

import numpy as np
import torch
import torchvision
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
from torch import nn, optim

transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((.5,), (.5,)),
                                ])

trainset = datasets.MNIST('PATH_TO_STORE_TRAINSET', download = True,
                          train = True, transform = transform)

valset = datasets.MNIST('PATH_TO_STORE_TESTSET', download = True,
                        train = False, transform = transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size = 64, 
                                          shuffle = True)

valloader = torch.utils.data.DataLoader(valset, batch_size = 64, 
                                          shuffle = True)

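# grab one batch and flatten each 28x28 image into a 784-dim vector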
images, labels = next(iter(trainloader))
images = images.view(images.shape[0], -1)

# layer sizes, matching the commented-out weight shapes in the class below
input_size = 784
hidden_sizes = [128, 64]
output_size = 10

linear1 = nn.Linear(input_size, hidden_sizes[0])
linear2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
linear3 = nn.Linear(hidden_sizes[1], output_size)
relu = nn.ReLU()
cel = nn.CrossEntropyLoss()

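# reference forward pass and loss through the PyTorch layers, for comparison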
l1 = linear1(images)
r1 = relu(l1)
l2 = linear2(r1)
r2 = relu(l2)
l3 = linear3(r2)
l = cel(l3, labels)
l.backward()

class NN(nn.Module):
    def __init__(self):
        super(NN, self).__init__()
        # parameters
        # self.w1 = torch.randn(128, 784)
        # self.w2 = torch.randn(64, 128)
        # self.w3 = torch.randn(10, 64)
        # self.b1 = torch.rand(128)
        # self.b2 = torch.randn(64)
        # self.b3 = torch.randn(10)
        self.w1 = linear1.weight
        self.w2 = linear2.weight
        self.w3 = linear3.weight
        self.b1 = linear1.bias
        self.b2 = linear2.bias
        self.b3 = linear3.bias
        self.lr = 0.003
        self.momentum = 0.9
    
    def softMax(self, input):
        # subtract the row-wise max before exponentiating: softmax is invariant
        # to this shift, and it avoids overflow for large logits
        e_x = torch.exp(input - input.max(dim=1, keepdim=True).values)
        return e_x / e_x.sum(dim=1, keepdim=True)
    
    def crossEntropy(self, input, target):
        m = target.shape[0]
        p = self.softMax(input)
        log_likelihood = -torch.log(p[range(m), target])
        loss = torch.sum(log_likelihood) / m
        return loss
    
    def crossEntropyDeriv(self, input, target):   
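        # gradient of the averaged cross-entropy w.r.t. the logits:
        # (softmax(input) - one_hot(target)) / batch_size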
        m = target.shape[0]
        grad = self.softMax(input)
        grad[range(m), target] -= 1
        grad /= m
        return grad
        
    def relu(self, input):
        result = torch.max(input, torch.zeros(input.shape))
        return result

    def reluDerivative(self, input):
        # elementwise: 1 where the input is non-negative, 0 where it is negative
        return (input >= 0).float()
    
    def linear(self, input, w, b):
        return input.matmul(w.t()) + b

    def linearDerivativewrtW(self, input, w, b):
        return input
    
    def linearDerivativewrtB(self, input, w, b):
        return torch.ones(b.shape)
    
    def linearDerivative(self, input, w, b):
        return w
        
    def forward(self, images):
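        # cache the input and all intermediate activations; backward reads them back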
        self.images = images
        self.linear1 = self.linear(self.images, self.w1, self.b1)
        self.relu1 = self.relu(self.linear1)
        self.linear2 = self.linear(self.relu1, self.w2, self.b2)
        self.relu2 = self.relu(self.linear2)
        self.linear3 = self.linear(self.relu2, self.w3, self.b3)
        return self.linear3
    
    def backward(self, output, target):
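        # gradient of the loss w.r.t. the logits of linear3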
        deltaL_Linear3 = self.crossEntropyDeriv(output, target)
       
        deltaL_w3 = deltaL_Linear3.matmul(self.linearDerivativewrtW(self.relu2, self.w3, self.b3))
        deltaL_b3 = deltaL_Linear3.matmul(self.linearDerivativewrtB(self.relu2, self.w3, self.b3))
        deltaL_relu2 = deltaL_Linear3.matmul(self.linearDerivative(self.relu2, self.w3, self.b3))
        
        deltaL_Linear2 = deltaL_relu2.matmul(self.reluDerivative(self.linear2))
        
        deltaL_w2 = deltaL_Linear2.matmul(self.linearDerivativewrtW(self.relu1, self.w2, self.b2))
        deltaL_b2 = deltaL_Linear2.matmul(self.linearDerivativewrtB(self.relu1, self.w2, self.b2))
        deltaL_relu1 = deltaL_Linear2.matmul(self.linearDerivative(self.relu1, self.w2, self.b2))
        
        deltaL_Linear1 = deltaL_relu1.matmul(self.reluDerivative(self.linear1))
       
        deltaL_w1 = deltaL_Linear1.matmul(self.linearDerivativewrtW(self.images, self.w1, self.b1))
        deltaL_b1 = deltaL_Linear1.matmul(self.linearDerivativewrtB(self.images, self.w1, self.b1))
        
        self.w1 = self.w1 - self.lr * deltaL_w1
        self.b1 = self.b1 - self.lr * deltaL_b1
        
        self.w2 = self.w2 - self.lr * deltaL_w2
        self.b2 = self.b2 - self.lr * deltaL_b2
        
        self.w3 = self.w3 - self.lr * deltaL_w3
        self.b3 = self.b3 - self.lr * deltaL_b3
        
Network = NN()
output = Network.forward(images)
Network.backward(output, labels)

Running this gives the following error:
runfile('C:/Users/calser/Downloads/DigitRecognition/Training.py', wdir='C:/Users/calser/Downloads/DigitRecognition')
Traceback (most recent call last):

  File "C:\Users\calser\Downloads\DigitRecognition\Training.py", line 152, in <module>
    Network.backward(output, target)

  File "C:\Users\calser\Downloads\DigitRecognition\Training.py", line 126, in backward
    deltaL_w3 = deltaL_Linear3.matmul(self.linearDerivativewrtW(self.relu2, self.w3, self.b3))

RuntimeError: size mismatch, m1: [64 x 10], m2: [64 x 64] at C:\w\1\s\tmp_conda_3.7_100118\conda\conda-bld\pytorch_1579082551706\work\aten\src\TH/generic/THTensorMath.cpp:136

Please don’t implement these functions yourself; they are already implemented in PyTorch. Search the documentation for the respective functions.

To get the sizes to match up, I would recommend debugging your program and paying close attention to each tensor.shape and to the broadcasting rules.
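
That said, here is a rough sketch of how the shapes can line up for your layer sizes (batch 64, layers 784 -> 128 -> 64 -> 10). Three things to check: for z = x.matmul(w.t()) + b the weight gradient is grad_z.t().matmul(x) (note the transpose), the bias gradient is grad_z.sum(dim=0) (a reduction over the batch, not a matmul), and the ReLU derivative enters elementwise via *, never via matmul. The grad_* names are mine; self.relu2, self.linear2, self.images are the activations you already cache in forward. Treat this as a sketch to check your code against, not tested drop-in code:

    def backward(self, output, target):
        # everything here is computed by hand, so keep autograd out of it
        with torch.no_grad():
            # dL/dz3 from softmax + cross-entropy: shape (64, 10)
            grad_z3 = self.crossEntropyDeriv(output, target)

            # for z = x.matmul(w.t()) + b:
            #   dL/dw = grad_z.t().matmul(x)   and   dL/db = grad_z.sum(dim=0)
            grad_w3 = grad_z3.t().matmul(self.relu2)   # (10, 64)  == w3.shape
            grad_b3 = grad_z3.sum(dim=0)               # (10,)     == b3.shape
            grad_r2 = grad_z3.matmul(self.w3)          # (64, 64)  flows to the layer below

            # ReLU derivative is elementwise: multiply, don't matmul
            grad_z2 = grad_r2 * self.reluDerivative(self.linear2)    # (64, 64)

            grad_w2 = grad_z2.t().matmul(self.relu1)   # (64, 128) == w2.shape
            grad_b2 = grad_z2.sum(dim=0)               # (64,)     == b2.shape
            grad_r1 = grad_z2.matmul(self.w2)          # (64, 128)

            grad_z1 = grad_r1 * self.reluDerivative(self.linear1)    # (64, 128)

            grad_w1 = grad_z1.t().matmul(self.images)  # (128, 784) == w1.shape
            grad_b1 = grad_z1.sum(dim=0)               # (128,)     == b1.shape

            # plain SGD step, in place so the nn.Parameter objects are reused
            self.w1 -= self.lr * grad_w1
            self.b1 -= self.lr * grad_b1
            self.w2 -= self.lr * grad_w2
            self.b2 -= self.lr * grad_b2
            self.w3 -= self.lr * grad_w3
            self.b3 -= self.lr * grad_b3

Every grad_w ends up with exactly the same shape as the w it updates, which is a quick sanity check you can run by printing tensor.shape after each line.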