How to do backpropagation with Softmax and Mean Squared Error?

I want to compare the losses of a NumPy implementation and a PyTorch implementation of the same network: one hidden ReLU layer followed by a softmax output layer, trained with mean squared error (MSE) loss.
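In equations, the forward pass and loss I am computing are (as far as I can tell from my own code):

$$h = x W_1,\qquad u = \max(h, 0),\qquad z = u W_2,\qquad \hat{y} = \operatorname{softmax}(z),\qquad L = \sum (\hat{y} - y)^2$$

Here is my code, first the NumPy version and then the PyTorch version.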

import numpy as np
from copy import deepcopy
np.random.seed(99)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 16, 100, 10, 2

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

xx = np.random.randn(D_in, H)
yy = np.random.randn(H, D_out)
w1 = deepcopy(xx)
w2 = deepcopy(yy)

def softmax(x):
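    # the global max is subtracted only for numerical stability
    # axis=0 means the softmax is taken over the batch dimension (rows),
    # matching nn.Softmax(dim=0) in the PyTorch version below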
    func = np.exp(x - np.max(x))
    return func / func.sum(axis=0)

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y
    h = np.dot(x, w1)
    u = np.maximum(h, 0)

    z = np.dot(u, w2)
    y_pred = softmax(z)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    if t % 50 == 0:
        print("Epoch [{:3d}/{:3d}] Loss: {:.10f}".format(t, 500, loss))

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
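    # backprop through the softmax using the element-wise derivative s * (1 - s)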
    grad_z = y_pred * (1 - y_pred) * grad_y_pred
    grad_w2 = np.dot(u.T, grad_z)
    grad_u = grad_z.dot(w2.T)
    grad_h = grad_u.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

print()

import torch
import torch.nn as nn
from torch.autograd import Variable
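# (Variable is a legacy wrapper; in recent PyTorch a tensor with requires_grad=True behaves the same)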

m = nn.Softmax(dim=0)
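# dim=0 matches the axis=0 softmax in the NumPy code above (normalization over the batch dimension)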

x_ = torch.from_numpy(x).type(torch.float64)
y_ = torch.from_numpy(y).type(torch.float64)

w1_ = torch.from_numpy(deepcopy(xx)).type(torch.float64)
w2_ = torch.from_numpy(deepcopy(yy)).type(torch.float64)

w1_ = Variable(w1_, requires_grad=True)
w2_ = Variable(w2_, requires_grad=True)

for t in range(500):
    h_ = x_.mm(w1_)
    u_ = h_.clamp(min=0)
    z = u_.mm(w2_)
    y_pred = m(z)

    loss_ = (y_pred - y_).pow(2).sum()
    if t % 50 == 0:
        print("Epoch [{:3d}/{:3d}] Loss: {:.10f}".format(t, 500, loss_.item()))
    loss_.backward()

    # manual SGD update; the gradients are zeroed by hand after the step
    with torch.no_grad():
        w1_ -= learning_rate * w1_.grad
        w2_ -= learning_rate * w2_.grad
    
        w1_.grad.zero_()
        w2_.grad.zero_()

And here is the output:

Output for NumPy implementation:

Epoch [  0/500] Loss: 29.7599750339
Epoch [ 50/500] Loss: 29.7568945381
Epoch [100/500] Loss: 29.7530540477
Epoch [150/500] Loss: 29.7481408013
Epoch [200/500] Loss: 29.7416476795
Epoch [250/500] Loss: 29.7326987989
Epoch [300/500] Loss: 29.7196529397
Epoch [350/500] Loss: 29.6990826814
Epoch [400/500] Loss: 29.6626811106
Epoch [450/500] Loss: 29.5857445210

Output for PyTorch implementation:

Epoch [  0/500] Loss: 29.7599750339
Epoch [ 50/500] Loss: 29.7555138011
Epoch [100/500] Loss: 29.7493562321
Epoch [150/500] Loss: 29.7403440862
Epoch [200/500] Loss: 29.7260022571
Epoch [250/500] Loss: 29.7000586454
Epoch [300/500] Loss: 29.6418893830
Epoch [350/500] Loss: 29.4537482662
Epoch [400/500] Loss: 28.9717000115
Epoch [450/500] Loss: 28.8537886818

But the two implementations do not produce similar outputs. Am I doing anything wrong in the backpropagation of my NumPy implementation? Please help. Thanks in advance.
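One thing I am not sure about: in the NumPy backward pass I only used the element-wise derivative y_pred * (1 - y_pred), i.e. the diagonal of the softmax Jacobian. If the full Jacobian-vector product is needed instead, I think it would look roughly like the sketch below (softmax_backward is just my own helper name, with y_pred and grad_y_pred as in the NumPy loop above):

def softmax_backward(y_pred, grad_y_pred):
    # full Jacobian-vector product of a softmax taken along axis 0
    # (the same axis as in the forward pass):
    # grad_z = s * (g - sum_over_axis0(g * s)), with s = y_pred and g = grad_y_pred
    dot = (grad_y_pred * y_pred).sum(axis=0, keepdims=True)
    return y_pred * (grad_y_pred - dot)

Is that the piece I am missing in grad_z, or does the difference come from somewhere else?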

Hey @ptrblck, could you please help me with this question? Thanks!