Hi all, I have a simple problem.
I have a Network
that just has a line of feed forward Neurons
. Each Neuron
just does y=mx+b on the input, and applies sigmoid. I have two versions of my network:
- Version #1 uses the nn.Module, nn.MSELoss, optim.SGD - this fails to learn.
- Version #2 is similar to #1, but everything is manually calculated in python. The loss is calculated manually. The grads are applied to weights manually - this learns great.
My goal is to get Version #1 working because it uses Pytorch’s functions.
Version #1:
import torch
import torch.nn as nn
import numpy as np
import math
import pandas as pd
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class Neuron(nn.Module):
def __init__(self):
super(Neuron, self).__init__()
self.reset_parameters()
def reset_parameters(self):
factory_kwargs = {'device': device, 'dtype': torch.float}
self.a = torch.nn.Parameter(torch.randn((), **factory_kwargs))
self.b = torch.nn.Parameter(torch.randn((), **factory_kwargs))
self.c = torch.nn.Parameter(torch.randn((), **factory_kwargs))
self.d = torch.nn.Parameter(torch.randn((), **factory_kwargs))
def forward(self, input):
return self.a * torch.sigmoid(input*self.b+self.c) + self.d + input
class MyNetwork(nn.Module):
def __init__(self):
super(MyNetwork, self).__init__()
self.n1 = Neuron()
self.n2 = Neuron()
self.n3 = Neuron()
self.n4 = Neuron()
self.n5 = Neuron()
self.n6 = Neuron()
self.n7 = Neuron()
self.loss_function = nn.MSELoss(reduction='mean')
self.optimizer = torch.optim.SGD(self.parameters(), lr=1e-4)
# self.loss_function = nn.L1Loss(reduction='mean')
# self.optimizer = torch.optim.Adam(self.parameters(), lr=1e-4)
def forward(self, inputs):
inputs = self.n1(inputs)
inputs = self.n2(inputs)
inputs = self.n3(inputs)
inputs = self.n4(inputs)
inputs = self.n5(inputs)
inputs = self.n6(inputs)
inputs = self.n7(inputs)
return inputs
def train(self, inputs, targets):
outputs = self.forward(inputs)
loss = self.loss_function(outputs, targets)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
ANet = MyNetwork()
inputs = torch.tensor(np.linspace(0, 2*math.pi, 2000))
targets = torch.sin(inputs)
for i in range(len(inputs)):
ANet.train(inputs[i], targets[i])
def plotter(x): return [x, ANet.forward(x)]
df = pd.DataFrame(map(plotter, inputs), columns=['input', 'output'])
ax1 = df.astype(float).plot.scatter(x='input', y='output').set_xlim(0, 2*math.pi)
This doesn’t seem to want to learn.
This implements the same thing but we calculate loss and modify the weights and biases manually.
Version #2
# Learning Sin(x)
# Inspired by https://pytorch.org/tutorials/beginner/pytorch_with_examples.html
# y = mx+b
# a1 : stretch vertically
# b1 : stretch horizontally
# c1 : shift horizontally
# d1 : shift vertically
dtype = torch.float
x = torch.tensor(np.linspace(0, 2*math.pi, 2000)) # 2000 tensors from 0 to 2pi
targets = np.sin(x) # y is the actual target that we wish to predict
# define weights and biases for the 7 learning functions: f(x) = m*x + b
a1 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
a2 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
a3 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
a4 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
a5 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
a6 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
a7 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b1 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b2 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b3 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b4 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b5 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b6 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b7 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c1 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c2 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c3 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c4 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c5 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c6 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c7 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d1 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d2 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d3 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d4 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d5 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d6 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d7 = torch.randn((), device=device, dtype=dtype, requires_grad=True)
def leaky_relu(x): return torch.nn.functional.leaky_relu(x)
def relu(x): return torch.nn.functional.relu(x)
def tanh(x): return torch.tanh(x)
def sigmoid(x): return torch.sigmoid(x)
def activation(x): return sigmoid(x)
def predictor(x):
#this is just y=mx+b
y = x
y = a1 * activation(y*b1 + c1) + d1 + y
y = a2 * activation(y*b2 + c2) + d2 + y
y = a3 * activation(y*b3 + c3) + d3 + y
y = a4 * activation(y*b4 + c4) + d4 + y
y = a5 * activation(y*b5 + c5) + d5 + y
y = a6 * activation(y*b6 + c6) + d6 + y
y = a7 * activation(y*b7 + c7) + d7 + y
return y
learning_rate = 1e-4
for t in range(10000):
preds = predictor(x)
loss = (preds - targets).pow(2).sum()
if t % 100 == 99:
print(t, loss.item())
# zero out grad
with torch.no_grad():
a1.grad = a2.grad = a3.grad = a4.grad = a5.grad = a6.grad = a7.grad = b1.grad = b2.grad = b3.grad = b4.grad = b5.grad = b6.grad = b7.grad = c1.grad = c2.grad = c3.grad = c4.grad = c5.grad = c6.grad = c7.grad = d1.grad = d2.grad = d3.grad = d4.grad = d5.grad = d6.grad = d7.grad = None
loss.backward()
# optimizer step
with torch.no_grad():
a1 -= learning_rate * a1.grad
a2 -= learning_rate * a2.grad
a3 -= learning_rate * a3.grad
a4 -= learning_rate * a4.grad
a5 -= learning_rate * a5.grad
a6 -= learning_rate * a6.grad
a7 -= learning_rate * a7.grad
b1 -= learning_rate * b1.grad
b2 -= learning_rate * b2.grad
b3 -= learning_rate * b3.grad
b4 -= learning_rate * b4.grad
b5 -= learning_rate * b5.grad
b6 -= learning_rate * b6.grad
b7 -= learning_rate * b7.grad
c1 -= learning_rate * c1.grad
c2 -= learning_rate * c2.grad
c3 -= learning_rate * c3.grad
c4 -= learning_rate * c4.grad
c5 -= learning_rate * c5.grad
c6 -= learning_rate * c6.grad
c7 -= learning_rate * c7.grad
d1 -= learning_rate * d1.grad
d2 -= learning_rate * d2.grad
d3 -= learning_rate * d3.grad
d4 -= learning_rate * d4.grad
d5 -= learning_rate * d5.grad
d6 -= learning_rate * d6.grad
d7 -= learning_rate * d7.grad
def plotter(x): return [x, predictor(x)]
df = pd.DataFrame(map(plotter, x), columns=['input', 'output'])
ax1 = df.astype(float).plot.scatter(x='input', y='output').set_xlim(0, 2*math.pi)
Each of the versions above is runnable. The goal is to have the network learn the sine function, then plot the chart that it learns. For some reason Version #2 works great but Version #1 fails to learn. Is there something obvious I’m missing?
Your help is greatly appreciated. Thanks guys!