Why is my NumPy implementation 8x faster than the PyTorch forward pass?

I ran the profiler as you suggested. To make the code easier to profile, I first split the PyTorch run and my NumPy run into separate functions. See below.

I'm not sure exactly what you expected to see. Nothing jumps out at me, other than that everything looks slow. Also, the times 13.7 s and 8.44 s don't add up to 14.1 s, and I'm not sure what to make of that.

import IPython as ipy
import numpy as np
import torch
import time

class Net(torch.nn.Module):
    """Small fully connected network: two ReLU hidden layers, then exp.

    The input passes through two linear layers of width ``layer_width``,
    each followed by ReLU, and then through a single-unit linear layer
    whose result is exponentiated.
    """

    def __init__(self, input_size, layer_width):
        """Create the three linear layers.

        Args:
            input_size: Number of features in an input vector.
            layer_width: Number of units in each of the two hidden layers.
        """
        super().__init__()
        self.input_size = input_size
        # Keep the fc1 -> fc2 -> fc3 construction order so the default
        # random initialisation draws from the RNG in the same sequence.
        self.fc1 = torch.nn.Linear(input_size, layer_width)
        self.fc2 = torch.nn.Linear(layer_width, layer_width)
        # The output layer has exactly one unit.
        self.fc3 = torch.nn.Linear(layer_width, 1)

    def forward(self, x):
        """Return ``exp(fc3(relu(fc2(relu(fc1(x))))))``."""
        activation = torch.nn.functional.relu
        hidden = activation(self.fc1(x))
        hidden = activation(self.fc2(hidden))
        return torch.exp(self.fc3(hidden))

    def get_input_size(self):
        """Return the ``input_size`` this network was constructed with."""
        return self.input_size

def pytorch_run(net, input_size, N_evals):
    """Time ``N_evals`` back-to-back calls of ``net`` on an all-ones input.

    Args:
        net: Callable invoked as ``net(tensor)``; its return value is
            discarded.
        input_size: Length of the 1-D all-ones tensor passed to ``net``.
        N_evals: Number of times ``net`` is called.

    Returns:
        Elapsed time of the evaluation loop in seconds, as a float.
    """
    # Build the input before starting the clock so only the forward passes
    # are measured.
    pytorch_x = torch.ones(input_size)
    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() follows the system clock and can be adjusted mid-run.
    start = time.perf_counter()
    for _ in range(N_evals):
        net(pytorch_x)
    return time.perf_counter() - start

def my_run(w1, w2, w3, b1, b2, b3, input_size, N_evals):
    """Time ``N_evals`` NumPy forward passes of a three-layer network.

    Each pass computes ``exp(w3 @ relu(w2 @ relu(w1 @ x + b1) + b2) + b3)``
    for an all-ones input ``x``; the result is discarded.

    Args:
        w1, w2, w3: Weight matrices of the three layers, applied as
            ``np.matmul(w, vector)``.
        b1, b2, b3: Bias vectors added after the matching matmul.
        input_size: Length of the all-ones input vector.
        N_evals: Number of forward passes to run.

    Returns:
        Elapsed time of the evaluation loop in seconds, as a float.
    """
    # Build the input before starting the clock so only the forward passes
    # are measured. NOTE: np.ones defaults to float64, so with float32
    # weights the matmuls are promoted to float64.
    x = np.ones(input_size)
    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() follows the system clock and can be adjusted mid-run.
    start = time.perf_counter()
    for _ in range(N_evals):
        # np.maximum(y, 0) is ReLU; the mask form y * (y > 0) turns -inf
        # into NaN and negative values into -0.0.
        y = np.maximum(np.matmul(w1, x) + b1, 0)
        y = np.maximum(np.matmul(w2, y) + b2, 0)
        np.exp(np.matmul(w3, y) + b3)
    return time.perf_counter() - start

def main(input_size=20, layer_width=20, N_evals=100000):
    """Compare a PyTorch forward pass with an equivalent NumPy one.

    Builds a ``Net``, prints its output for an all-ones input, times
    ``N_evals`` PyTorch evaluations, then repeats both steps with a NumPy
    re-implementation that uses the same weights and biases.

    Side effects: sets the global torch default dtype to float32, disables
    gradient tracking globally, and prints four lines to stdout.

    Args:
        input_size: Length of the input vector.
        layer_width: Width of the two hidden layers.
        N_evals: Number of forward passes timed for each implementation.
    """
    torch.set_default_dtype(torch.float32)
    torch.set_grad_enabled(False)
    net = Net(input_size, layer_width)

    # PyTorch: one evaluation to show the answer, then the timed loop.
    pytorch_x = torch.ones(input_size)
    pytorch_answer = net(pytorch_x).detach().numpy()
    print(f"pytorch answer = {pytorch_answer!s}")
    pytorch_time = pytorch_run(net, input_size, N_evals)
    print(f"pytorch takes {pytorch_time!s} s")

    # Pull the trained-in parameters out of the network as NumPy arrays.
    layers = (net.fc1, net.fc2, net.fc3)
    w1, w2, w3 = (layer.weight.detach().numpy() for layer in layers)
    b1, b2, b3 = (layer.bias.detach().numpy() for layer in layers)

    # NumPy: the same forward pass by hand. np.maximum(y, 0) is ReLU; the
    # mask form y * (y > 0) would turn -inf into NaN.
    x = np.ones(input_size)
    y = np.maximum(np.matmul(w1, x) + b1, 0)
    y = np.maximum(np.matmul(w2, y) + b2, 0)
    my_answer = np.exp(np.matmul(w3, y) + b3)
    print(f"my answer = {my_answer!s}")

    my_time = my_run(w1, w2, w3, b1, b2, b3, input_size, N_evals)
    print(f"my imlementation takes {my_time!s} s")


# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()