Hi,
Could somebody please help me understand why my PyTorch cost is not converging while my NumPy version does (using the same logic)? I am trying to create a fizz buzz program similar to Joel's TensorFlow implementation.
I have a 2-layer dense network with sigmoid activations in both layers. I am using MSE as the cost and the same hyperparameters for both the NumPy and PyTorch scripts. My NumPy script converges in less than 1k epochs, while the PyTorch one is still jumping around even after 5k.
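For reference, the forward pass in both scripts (in the notation of the code below) is

    a2  = sigmoid(x . w1 + b1)
    hyp = sigmoid(a2 . w2 + b2)

with the squared error (hyp - y)^2 as the cost, taken over each mini-batch (summed over the batch in the PyTorch script, averaged in the NumPy one).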
PyTorch implementation:
import numpy as np
import torch as th
from torch.autograd import Variable

input_size = 10
epochs = 1000
batches = 64
lr = 0.01


def binary_enc(num):
    ret = [int(i) for i in '{0:b}'.format(num)]
    return [0] * (input_size - len(ret)) + ret


def binary_dec(array):
    ret = 0
    for i in array:
        ret = ret * 2 + int(i)
    return ret


def training_test_gen(x, y):
    assert len(x) == len(y)
    indices = np.random.permutation(range(len(x)))
    split_size = int(0.9 * len(indices))
    trX = x[indices[:split_size]]
    trY = y[indices[:split_size]]
    teX = x[indices[split_size:]]
    teY = y[indices[split_size:]]
    return trX, trY, teX, teY


def x_y_gen():
    x = []
    y = []
    for i in range(1000):
        x.append(binary_enc(i))
        if i % 15 == 0:
            y.append([1, 0, 0, 0])
        elif i % 5 == 0:
            y.append([0, 1, 0, 0])
        elif i % 3 == 0:
            y.append([0, 0, 1, 0])
        else:
            y.append([0, 0, 0, 1])
    return training_test_gen(np.array(x), np.array(y))


def check_fizbuz(i):
    if i % 15 == 0:
        return 'fizbuz'
    elif i % 5 == 0:
        return 'buz'
    elif i % 3 == 0:
        return 'fiz'
    else:
        return 'number'


trX, trY, teX, teY = x_y_gen()
if th.cuda.is_available():
    dtype = th.cuda.FloatTensor
else:
    dtype = th.FloatTensor
x = Variable(th.from_numpy(trX).type(dtype), requires_grad=False)
y = Variable(th.from_numpy(trY).type(dtype), requires_grad=False)
w1 = Variable(th.randn(10, 100).type(dtype), requires_grad=True)
w2 = Variable(th.randn(100, 4).type(dtype), requires_grad=True)
b1 = Variable(th.zeros(1, 100).type(dtype), requires_grad=True)
b2 = Variable(th.zeros(1, 4).type(dtype), requires_grad=True)
no_of_batches = int(len(trX) / batches)
for epoch in range(epochs):
    for batch in range(no_of_batches):
        start = batch * batches
        end = start + batches
        x_ = x[start:end]
        y_ = y[start:end]

        # forward
        a2 = x_.mm(w1)
        a2 = a2.add(b1.expand_as(a2))
        h2 = a2.sigmoid()
        a3 = h2.mm(w2)
        a3 = a3.add(b2.expand_as(a3))
        hyp = a3.sigmoid()

        # backward
        error = hyp - y_
        loss = error.pow(2).sum()
        loss.backward()

        # parameter update
        w1.data -= lr * w1.grad.data
        w2.data -= lr * w2.grad.data
        b1.data -= lr * b1.grad.data
        b2.data -= lr * b2.grad.data
        w1.grad.data.zero_()
        w2.grad.data.zero_()
    print(epoch, error.mean().data[0])
NumPy implementation:
import numpy as np

input_size = 10
epochs = 1000
batches = 64
lr = 0.01


def sig(val):
    return 1 / (1 + np.exp(-val))


def sig_d(val):
    sig_val = sig(val)
    return sig_val * (1 - sig_val)


def binary_enc(num):
    ret = [int(i) for i in '{0:b}'.format(num)]
    return [0] * (input_size - len(ret)) + ret


def binary_dec(array):
    ret = 0
    for i in array:
        ret = ret * 2 + int(i)
    return ret


def training_test_gen(x, y):
    assert len(x) == len(y)
    indices = np.random.permutation(range(len(x)))
    split_size = int(0.9 * len(indices))
    trX = x[indices[:split_size]]
    trY = y[indices[:split_size]]
    teX = x[indices[split_size:]]
    teY = y[indices[split_size:]]
    return trX, trY, teX, teY


def x_y_gen():
    x = []
    y = []
    for i in range(1000):
        x.append(binary_enc(i))
        if i % 15 == 0:
            y.append([1, 0, 0, 0])
        elif i % 5 == 0:
            y.append([0, 1, 0, 0])
        elif i % 3 == 0:
            y.append([0, 0, 1, 0])
        else:
            y.append([0, 0, 0, 1])
    return training_test_gen(np.array(x), np.array(y))


def check_fizbuz(i):
    if i % 15 == 0:
        return 'fizbuz'
    elif i % 5 == 0:
        return 'buz'
    elif i % 3 == 0:
        return 'fiz'
    else:
        return 'number'


trX, trY, teX, teY = x_y_gen()
w1 = np.random.randn(10, 100)
w2 = np.random.randn(100, 4)
b1 = np.zeros((1, 100))
b2 = np.zeros((1, 4))
no_of_batches = int(len(trX) / batches)
for epoch in range(epochs):
    for batch in range(no_of_batches):
        # forward
        start = batch * batches
        end = start + batches
        x = trX[start:end]
        y = trY[start:end]
        a2 = x.dot(w1) + b1
        h2 = sig(a2)
        a3 = h2.dot(w2) + b2
        hyp = sig(a3)
        error = hyp - y
        loss = (error ** 2).mean()

        # backward
        outerror = error
        outgrad = outerror * sig_d(a3)
        outdelta = h2.T.dot(outgrad)
        outbiasdelta = np.ones([1, batches]).dot(outgrad)
        hiddenerror = outerror.dot(w2.T)
        hiddengrad = hiddenerror * sig_d(a2)
        hiddendelta = x.T.dot(hiddengrad)
        hiddenbiasdelta = np.ones([1, batches]).dot(hiddengrad)

        w1 -= hiddendelta * lr
        b1 -= hiddenbiasdelta * lr
        w2 -= outdelta * lr
        b2 -= outbiasdelta * lr
    print(epoch, loss)

# test
a2 = teX.dot(w1) + b1
h2 = sig(a2)
a3 = h2.dot(w2) + b2
hyp = sig(a3)
outli = ['fizbuz', 'buz', 'fiz', 'number']
for i in range(len(teX)):
    num = binary_dec(teX[i])
    print(
        'Number: {} -- Actual: {} -- Prediction: {}'.format(
            num, check_fizbuz(num), outli[hyp[i].argmax()]))
print('Test loss: ', np.mean((teY - hyp) ** 2))
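In case someone wants to reproduce this side by side: the snippet below is a minimal sketch (my addition, not part of either script above) for starting both implementations from identical weights, to rule out differences that come purely from random initialization. The seed value 42 and the variable names w1_init, w2_init, w1_numpy, w2_numpy, w1_torch, w2_torch are arbitrary choices of mine.

import numpy as np
import torch as th
from torch.autograd import Variable

np.random.seed(42)  # arbitrary seed, just so both scripts see the same draw

# Draw the initial weights once in NumPy...
w1_init = np.random.randn(10, 100).astype(np.float32)
w2_init = np.random.randn(100, 4).astype(np.float32)

# ...use copies of them directly in the NumPy script...
w1_numpy = w1_init.copy()
w2_numpy = w2_init.copy()

# ...and load the same values into the PyTorch Variables.
w1_torch = Variable(th.from_numpy(w1_init), requires_grad=True)
w2_torch = Variable(th.from_numpy(w2_init), requires_grad=True)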