Hi all, I’m trying to replicate the same neural network as here by manually writing all the required functions alongside with their derivatives for backpropagation (Linear, ReLU and CrossEntropyLoss). The forward propagation seems to be ok, as if I use the weights and biases of linear1, linear2 and linear3 and pass them through my forward method, I obtain the same loss for a batch of images as in the network created with nn.Sequential, which uses PyTorch defined layers. Anyway, when I call the backward method written by me, it looks like the matrices don’t match the required sizes to be able to multiply them and obtain the weights / biases gradients, and I tried multiple ways to transpose some of those matrices to match the sizes, but in the end I cannot match all of them. Could anyone advice me how should I modify the backward for weights / biases gradients to be correctly computed?
Sorry if it isn’t in the right category!
import numpy as np
import torch
import torchvision
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
from torch import nn, optim
transform = transforms.Compose([transforms.ToTensor(),
transforms.Normalize((.5,), (.5,)),
])
trainset = datasets.MNIST('PATH_TO_STORE_TRAINSET', download = True,
train = True, transform = transform)
valset = datasets.MNIST('PATH_TO_STORE_TESTSET', download = True,
train = False, transform = transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size = 64,
shuffle = True)
valloader = torch.utils.data.DataLoader(valset, batch_size = 64,
shuffle = True)
images, labels = next(iter(trainloader))
images = images.view(images.shape[0], -1)
linear1 = nn.Linear(input_size, hidden_sizes[0])
linear2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
linear3 = nn.Linear(hidden_sizes[1], output_size)
relu = nn.ReLU()
cel = nn.CrossEntropyLoss()
l1 = linear1(images)
r1 = relu(l1)
l2 = linear2(r1)
r2 = relu(l2)
l3 = linear3(r2)
l = cel(l3, labels)
l.backward()
class NN(nn.Module):
def __init__(self):
super(NN, self).__init__()
# parameters
# self.w1 = torch.randn(128, 784)
# self.w2 = torch.randn(64, 128)
# self.w3 = torch.randn(10, 64)
# self.b1 = torch.rand(128)
# self.b2 = torch.randn(64)
# self.b3 = torch.randn(10)
self.w1 = linear1.weight
self.w2 = linear2.weight
self.w3 = linear3.weight
self.b1 = linear1.bias
self.b2 = linear2.bias
self.b3 = linear3.bias
self.lr = 0.003
self.momentum = 0.9
def softMax(self, input):
e_x = torch.exp(input)
result = torch.zeros(e_x.shape)
for dim in range(e_x.shape[0]):
result[dim] = e_x[dim] / torch.sum(e_x[dim])
return result
def crossEntropy(self, input, target):
m = target.shape[0]
p = self.softMax(input)
log_likelihood = -torch.log(p[range(m), target])
loss = torch.sum(log_likelihood) / m
return loss
def crossEntropyDeriv(self, input, target):
m = target.shape[0]
grad = self.softMax(input)
grad[range(m), target] -= 1
grad /= m
return grad
def relu(self, input):
result = torch.max(input, torch.zeros(input.shape))
return result
def reluDerivative(self, input):
result = torch.ones(input.shape)
result[input < 0] -= 1
return result
def linear(self, input, w, b):
return input.matmul(w.t()) + b
def linearDerivativewrtW(self, input, w, b):
return input
def linearDerivativewrtB(self, input, w, b):
return torch.ones(b.shape)
def linearDerivative(self, input, w, b):
return w
def forward(self, images):
self.images = images
self.linear1 = self.linear(self.images, self.w1, self.b1)
self.relu1 = self.relu(self.linear1)
self.linear2 = self.linear(self.relu1, self.w2, self.b2)
self.relu2 = self.relu(self.linear2)
self.linear3 = self.linear(self.relu2, self.w3, self.b3)
return self.linear3
def backward(self, output, target):
deltaL_Linear3 = self.crossEntropyDeriv(output, target)
deltaL_w3 = deltaL_Linear3.matmul(self.linearDerivativewrtW(self.relu2, self.w3, self.b3))
deltaL_b3 = deltaL_Linear3.matmul(self.linearDerivativewrtB(self.relu2, self.w3, self.b3))
deltaL_relu2 = deltaL_Linear3.matmul(self.linearDerivative(self.relu2, self.w3, self.b3))
deltaL_Linear2 = deltaL_relu2.matmul(self.reluDerivative(self.linear2))
deltaL_w2 = deltaL_Linear2.matmul(self.linearDerivativewrtW(self.relu1, self.w2, self.b2))
deltaL_b2 = deltaL_Linear2.matmul(self.linearDerivativewrtB(self.relu1, self.w2, self.b2))
deltaL_relu1 = deltaL_Linear2.matmul(self.linearDerivative(self.relu1, self.w2, self.b2))
deltaL_Linear1 = deltaL_relu1.matmul(self.reluDerivative(self.linear1))
deltaL_w1 = deltaL_Linear1.matmul(self.linearDerivativewrtW(self.images, self.w1, self.b1))
deltaL_b1 = deltaL_Linear1.matmul(self.linearDerivativewrtB(self.images, self.w1, self.b1))
self.w1 = self.w1 - self.lr * deltaL_w1
self.b1 = self.b1 - self.lr * deltaL_b1
self.w2 = self.w2 - self.lr * deltaL_w2
self.b2 = self.b2 - self.lr * deltaL_b2
self.w3 = self.w3 - self.lr * deltaL_w3
self.b3 = self.b3 - self.lr * deltaL_b3
Network = NN()
output = Network.forward(images)
Network.backward(output, target)
runfile('C:/Users/calser/Downloads/DigitRecognition/Training.py', wdir='C:/Users/calser/Downloads/DigitRecognition')
Traceback (most recent call last):
File "C:\Users\calser\Downloads\DigitRecognition\Training.py", line 152, in <module>
Network.backward(output, target)
File "C:\Users\calser\Downloads\DigitRecognition\Training.py", line 126, in backward
deltaL_w3 = deltaL_Linear3.matmul(self.linearDerivativewrtW(self.relu2, self.w3, self.b3))
RuntimeError: size mismatch, m1: [64 x 10], m2: [64 x 64] at C:\w\1\s\tmp_conda_3.7_100118\conda\conda-bld\pytorch_1579082551706\work\aten\src\TH/generic/THTensorMath.cpp:136