# Digit recognition manually write all methods

Hi all, I’m trying to replicate the same neural network as here by manually writing all the required functions alongside with their derivatives for backpropagation (Linear, ReLU and CrossEntropyLoss). The forward propagation seems to be ok, as if I use the weights and biases of linear1, linear2 and linear3 and pass them through my forward method, I obtain the same loss for a batch of images as in the network created with nn.Sequential, which uses PyTorch defined layers. Anyway, when I call the backward method written by me, it looks like the matrices don’t match the required sizes to be able to multiply them and obtain the weights / biases gradients, and I tried multiple ways to transpose some of those matrices to match the sizes, but in the end I cannot match all of them. Could anyone advice me how should I modify the backward for weights / biases gradients to be correctly computed? Sorry if it isn’t in the right category!

``````import numpy as np
import torch
import torchvision
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
from torch import nn, optim

transform = transforms.Compose([transforms.ToTensor(),
transforms.Normalize((.5,), (.5,)),
])

train = True, transform = transform)

train = False, transform = transform)

shuffle = True)

shuffle = True)

images = images.view(images.shape, -1)

linear1 = nn.Linear(input_size, hidden_sizes)
linear2 = nn.Linear(hidden_sizes, hidden_sizes)
linear3 = nn.Linear(hidden_sizes, output_size)
relu = nn.ReLU()
cel = nn.CrossEntropyLoss()

l1 = linear1(images)
r1 = relu(l1)
l2 = linear2(r1)
r2 = relu(l2)
l3 = linear3(r2)
l = cel(l3, labels)
l.backward()

class NN(nn.Module):
def __init__(self):
super(NN, self).__init__()
# parameters
# self.w1 = torch.randn(128, 784)
# self.w2 = torch.randn(64, 128)
# self.w3 = torch.randn(10, 64)
# self.b1 = torch.rand(128)
# self.b2 = torch.randn(64)
# self.b3 = torch.randn(10)
self.w1 = linear1.weight
self.w2 = linear2.weight
self.w3 = linear3.weight
self.b1 = linear1.bias
self.b2 = linear2.bias
self.b3 = linear3.bias
self.lr = 0.003
self.momentum = 0.9

def softMax(self, input):
e_x = torch.exp(input)
result = torch.zeros(e_x.shape)
for dim in range(e_x.shape):
result[dim] = e_x[dim] / torch.sum(e_x[dim])

return result

def crossEntropy(self, input, target):
m = target.shape
p = self.softMax(input)
log_likelihood = -torch.log(p[range(m), target])
loss = torch.sum(log_likelihood) / m
return loss

def crossEntropyDeriv(self, input, target):
m = target.shape

def relu(self, input):
result = torch.max(input, torch.zeros(input.shape))
return result

def reluDerivative(self, input):
result = torch.ones(input.shape)
result[input < 0] -= 1
return result

def linear(self, input, w, b):
return input.matmul(w.t()) + b

def linearDerivativewrtW(self, input, w, b):
return input

def linearDerivativewrtB(self, input, w, b):

def linearDerivative(self, input, w, b):
return w

def forward(self, images):
self.images = images
self.linear1 = self.linear(self.images, self.w1, self.b1)
self.relu1 = self.relu(self.linear1)
self.linear2 = self.linear(self.relu1, self.w2, self.b2)
self.relu2 = self.relu(self.linear2)
self.linear3 = self.linear(self.relu2, self.w3, self.b3)
return self.linear3

def backward(self, output, target):
deltaL_Linear3 = self.crossEntropyDeriv(output, target)

deltaL_w3 = deltaL_Linear3.matmul(self.linearDerivativewrtW(self.relu2, self.w3, self.b3))
deltaL_b3 = deltaL_Linear3.matmul(self.linearDerivativewrtB(self.relu2, self.w3, self.b3))
deltaL_relu2 = deltaL_Linear3.matmul(self.linearDerivative(self.relu2, self.w3, self.b3))

deltaL_Linear2 = deltaL_relu2.matmul(self.reluDerivative(self.linear2))

deltaL_w2 = deltaL_Linear2.matmul(self.linearDerivativewrtW(self.relu1, self.w2, self.b2))
deltaL_b2 = deltaL_Linear2.matmul(self.linearDerivativewrtB(self.relu1, self.w2, self.b2))
deltaL_relu1 = deltaL_Linear2.matmul(self.linearDerivative(self.relu1, self.w2, self.b2))

deltaL_Linear1 = deltaL_relu1.matmul(self.reluDerivative(self.linear1))

deltaL_w1 = deltaL_Linear1.matmul(self.linearDerivativewrtW(self.images, self.w1, self.b1))
deltaL_b1 = deltaL_Linear1.matmul(self.linearDerivativewrtB(self.images, self.w1, self.b1))

self.w1 = self.w1 - self.lr * deltaL_w1
self.b1 = self.b1 - self.lr * deltaL_b1

self.w2 = self.w2 - self.lr * deltaL_w2
self.b2 = self.b2 - self.lr * deltaL_b2

self.w3 = self.w3 - self.lr * deltaL_w3
self.b3 = self.b3 - self.lr * deltaL_b3

Network = NN()
output = Network.forward(images)
Network.backward(output, target)
``````
``````runfile('C:/Users/calser/Downloads/DigitRecognition/Training.py', wdir='C:/Users/calser/Downloads/DigitRecognition')
Traceback (most recent call last):

In order to match up the sizes, I would recommend you debug your program and take caution of `tensor.shape` and the broadcasting rule