I’ve created a CNN from scratch using only PyTorch tensors and matrix-operation functions, in the hope of utilizing the GPU. To my surprise, the GPU stays at 0% utilization and training doesn’t seem to be any faster than running on my CPU.
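My understanding is that every per-element read, multiply, and assignment on a CUDA tensor launches its own tiny kernel, so deeply nested Python loops keep the GPU idle no matter where the tensors live. A minimal sketch of the effect (illustrative only, assumes a CUDA device):
import time, torch
a = torch.randn(512, 512, device='cuda')
b = torch.randn(512, 512, device='cuda')
out = torch.zeros(512, 512, device='cuda')
torch.cuda.synchronize(); t0 = time.time()
for i in range(512):                      # one small kernel launch per row
    out[i] = a[i] @ b
torch.cuda.synchronize(); print('row-by-row:', time.time() - t0)
t0 = time.time()
out = a @ b                               # one large kernel keeps the GPU busy
torch.cuda.synchronize(); print('one matmul:', time.time() - t0)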
import operator
from pathlib import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor, nn
import numpy as np
import pandas as pd
MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'
class Module():
def __call__(self, *args):
self.args = args
self.out = self.forward(*args)
# print(self.out)
return self.out
    def forward(self, *args):
        raise NotImplementedError("Forward propagation isn't defined for this module.")
def backward(self):
self.bwd(self.out, *self.args)
def zero_grad(self):
self.grad_zero(*self.args)
def fit(self, learning_rate, adam_optimization, idx):
self.update(learning_rate, adam_optimization, idx, *self.args)
class Model():
def __init__(self, w1, b1, w2, b2, p1, w3, b3, w4, b4, w5, b5, w6, b6):
# print(p1)
# w1 (4, 4, 1, 12)
# w2 (4, 4, 12, 24)
# w3 (4, 4, 24, 48)
# w4 (432, 50)
# w5 (50, 10)
self.layers = [
Conv(w1,b1,0,1,is_first_layer=True), # torch.Size([3, 25, 25, 12])
Relu(), # torch.Size([3, 25, 25, 12])
Pool(p1['filter_shape'], p1['padding'], p1['stride'], p1['pool_type']), # torch.Size([3, 23, 23, 12])
Conv(w2,b2,0,1), # torch.Size([3, 10, 10, 24])
Relu(), # torch.Size([3, 10, 10, 24])
            Pool(p2['filter_shape'], p2['padding'], p2['stride'], p2['pool_type']), # torch.Size([3, 8, 8, 24]); note: p2 is read from the enclosing scope, unlike p1
# Conv(w3,b3,0,1), # torch.Size([3, 3, 3, 48])
# Relu(), # torch.Size([3, 3, 3, 48])
Lin(w4,b4), # torch.Size([3, 50])
Relu(), # torch.Size([3, 50])
Lin(w5,b5), # torch.Size([3, 10])
Relu(), # torch.Size([3, 10])
Lin(w6,b6), # torch.Size([3, 10])
# Relu(), # torch.Size([3, 10])
Softmax()
]
self.loss = CEL()
def __call__(self, x, targ):
for l in self.layers:
x = l(x)
return self.loss(x, targ)
def backward(self):
self.loss.backward()
for l in reversed(self.layers): l.backward()
def fit(self, learning_rate, adam_optimization=False):
self.loss.fit(learning_rate, adam_optimization, len(self.layers))
for idx in reversed(range(len(self.layers))):
self.layers[idx].fit(learning_rate, adam_optimization, idx)
def zero_grad(self):
self.loss.zero_grad()
for l in reversed(self.layers): l.zero_grad()
def predict(self, x_valid, y_valid):
a = x_valid.cuda()
for l in self.layers:
a = l(a)
self.predictions = a.cuda()
return self.predictions, self.loss(a, y_valid)
class Relu(Module):
def forward(self, inp):
        return inp.clamp_min(0) - 0.5  # shifted ReLU: subtracting 0.5 keeps activations roughly zero-centered
def bwd(self, out, inp):
# print(inp.shape, out.g.shape)
diff_dim_out_g = out.g.cuda()
if inp.shape != out.g.shape : diff_dim_out_g = out.g.view(inp.shape)
inp.g = (inp.cuda() > 0).float() * diff_dim_out_g
def update(self, learning_rate, adam_optimization, layerIndex, inp):
inp -= inp.g * learning_rate
def grad_zero(self, inp, *args):
inp.g = inp.g * 0
class Lin(Module):
def __init__(self, w, b, is_first_layer=False):
self.w, self.b = w, b
self.is_first_layer = is_first_layer
def forward(self, inp):
diff_dim_inp = inp
if len(inp.shape) == 4 :
diff_dim_inp = inp.view(inp.shape[0], inp.shape[1] * inp.shape[2] * inp.shape[3]).cuda()
self.out = diff_dim_inp@self.w + self.b
return self.out
def bwd(self, out, inp):
diff_dim_inp = inp
if len(inp.shape) == 4 :
diff_dim_inp = inp.view(inp.shape[0], inp.shape[1] * inp.shape[2] * inp.shape[3]).cuda()
inp.g = out.g @ self.w.t()
# print((out.g @ self.w.t()).shape, inp.g.shape, '## lin inp.g')
self.w.g = diff_dim_inp.t() @ out.g
        self.b.g = out.g.sum(0)  # sum over the batch dimension only; out.g is 2-D here
def update(self, learning_rate, adam_optimization, layerIndex, inp):
w_g = self.w.g
b_g = self.b.g
w = self.w
b = self.b
        if adam_optimization:
self.w, self.b = update_parameters_with_adam(w, b, w_g, b_g, layerIndex, learning_rate )
else:
self.w -= self.w.g * learning_rate
self.b -= self.b.g * learning_rate
self.b.g = b_g
self.w.g = w_g
diff_inp_g = inp.g
        if not self.is_first_layer:
if inp.shape != inp.g.shape: diff_inp_g = inp.g.view(inp.shape).cuda()
inp -= diff_inp_g * learning_rate
def grad_zero(self, inp, *args):
inp.g = inp.g * 0
self.w.g = self.w.g * 0
self.b.g = self.b.g * 0
class Conv(Module):
def __init__(self, w, b, padding, stride, is_first_layer=False):
self.w, self.b, self.padding, self.stride = w, b, padding, stride
self.is_first_layer = is_first_layer
def forward(self, inp):
self.out = conv_forward(inp, self.w, self.b, self.padding, self.stride)
return self.out
def bwd(self, out, inp):
inp.g, self.w.g, self.b.g = conv_backward(inp, out, self.w, self.b, self.padding, self.stride)
def update(self, learning_rate, adam_optimization, layerIndex, inp):
w_g = self.w.g
b_g = self.b.g
        if adam_optimization:
self.w, self.b = update_parameters_with_adam(self.w, self.b, w_g, b_g, layerIndex, learning_rate)
else:
self.w -= self.w.g * learning_rate
self.b -= self.b.g * learning_rate
self.w.g = w_g
self.b.g = b_g
        if not self.is_first_layer: inp -= inp.g * learning_rate
def grad_zero(self, inp, *args):
inp.g = inp.g * 0
self.w.g = self.w.g * 0
self.b.g = self.b.g * 0
class Softmax(Module):
    def forward(self, inp):
        # numerically stable softmax, kept on the GPU (no host round-trip)
        e_x = torch.exp(inp - inp.max(1, keepdim=True).values)
        self.out = e_x / e_x.sum(1, keepdim=True)
        return self.out
    def bwd(self, out, inp):
        # combined softmax + cross-entropy gradient; note that `y` here is the
        # current mini-batch label tensor picked up from the global scope
        inp.g = out.cuda() - y.cuda()
def update(self, learning_rate, adam_optimization, layerIndex, inp):
inp -= inp.g * learning_rate
def grad_zero(self, inp, *args):
inp.g = inp.g * 0
class Mse(Module):
def forward(self, inp, targ):
self.out = ((inp.squeeze(-1) - targ).pow(2).mean()).cuda()
return self.out
def bwd(self, out, inp, targ):
inp.g = 2 * (inp - targ) / targ.shape[0]
def update(self, learning_rate, adam_optimization, layerIndex, inp, *args):
inp -= inp.g * learning_rate
def grad_zero(self, inp, *args):
inp.g = inp.g * 0
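# Standalone finite-difference gradient check (an illustrative sketch, not part
# of the model): perturb one weight at a time and compare the numerical
# gradient against an analytic one, e.g. the tensors produced by conv_backward.
def grad_check(f, w, w_g, epsilon=1e-5):
    # f: zero-argument closure that recomputes the scalar loss
    # w: parameter tensor (assumed contiguous); w_g: its analytic gradient
    approx = torch.zeros_like(w)
    flat, flat_a = w.view(-1), approx.view(-1)
    for idx in range(flat.numel()):
        old = flat[idx].item()
        flat[idx] = old + epsilon; j_plus = f().item()
        flat[idx] = old - epsilon; j_minus = f().item()
        flat[idx] = old
        flat_a[idx] = (j_plus - j_minus) / (2 * epsilon)
    numerator = (w_g - approx).norm()
    # small relative difference (~1e-7) means the gradients agree
    return (numerator / (w_g.norm() + approx.norm())).item()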
class CEL(Module):
def forward(self, inp, targ):
self.out = (- (1/targ.shape[0]) * ( targ.cuda() * (torch.log(inp.cuda())) ).sum()).cuda()
return self.out
def bwd(self, out, inp, targ):
inp.g = (- targ.cuda()/inp.cuda()).cuda()
def update(self, learning_rate, adam_optimization, layerIndex, inp, *args):
inp -= inp.g * learning_rate
def grad_zero(self, inp, *args):
inp.g = inp.g * 0
def conv_forward(inp, weight, bias, padding=0, stride=1, rotate=0):
    # NHWC convolution computed one output scalar at a time: every loop
    # iteration launches a tiny CUDA kernel, far too little work to register
    # as GPU utilization, and each input slice is copied to the GPU
    # individually. Note that `padding` only enters the output-size
    # computation below; the input itself is never padded, so only padding=0
    # behaves correctly as written.
m, n_h, n_w, n_c = inp.shape
f, f, n_c, n_f = weight.shape
assert(f <= n_h and f <= n_w)
h = int ( ((n_h - f + (2 * padding)) / stride ) ) + 1
w = int ( ((n_w - f + (2 * padding)) / stride ) ) + 1
out = torch.zeros((m, h, w, n_f)).cuda()
for i in range(m):
for j in range(h):
for k in range(w):
for c in range(n_f):
vert_start, horiz_start = j * stride, k * stride
vert_end = vert_start + f
horiz_end = horiz_start + f
                    inp_slice = inp[i, vert_start:vert_end, horiz_start:horiz_end, :].cuda()
                    s = (weight[:,:,:,c].cuda() * inp_slice).sum() + bias[:,:,:,c].cuda()
                    out[i, j, k, c] = s
return out
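# For comparison, a vectorized sketch of the same NHWC convolution (an
# illustrative alternative, not the code I trained with): im2col via F.unfold
# turns the whole convolution into one big matmul, which is the kind of work a
# GPU can actually parallelize.
import torch.nn.functional as F
def conv_forward_vectorized(inp, weight, bias, padding=0, stride=1):
    m, n_h, n_w, n_c = inp.shape
    f, _, _, n_f = weight.shape
    x = inp.permute(0, 3, 1, 2)                            # NHWC -> NCHW
    cols = F.unfold(x, f, padding=padding, stride=stride)  # (m, n_c*f*f, L)
    w = weight.permute(3, 2, 0, 1).reshape(n_f, -1)        # (n_f, n_c*f*f)
    out = w @ cols + bias.reshape(n_f, 1)                  # (m, n_f, L)
    h = (n_h - f + 2 * padding) // stride + 1
    w_out = (n_w - f + 2 * padding) // stride + 1
    return out.view(m, n_f, h, w_out).permute(0, 2, 3, 1)  # back to NHWC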
def pad(tensor, number):
    pd = (0, 0, number, number, number, number, 0, 0)  # pads the spatial H and W dims by `number` on each side; N and C untouched
out = torch.nn.functional.pad(tensor, pd, "constant", 0)
return out
def conv_backward(inp, out, weight, bias, padding=0, stride=1):
inp.g = torch.zeros(inp.shape).cuda()
bias.g = torch.zeros(bias.shape).cuda()
weight.g = torch.zeros(weight.shape).cuda()
m, n_h, n_w, c = inp.shape # torch.Size([3, 23, 23, 12])
m, h, w, out_c = out.g.shape # torch.Size([3, 10, 10, 24])
f, f, n_c, n_f = weight.shape # torch.Size([4, 4, 12, 24])
inp_pad = pad(inp, padding)
inp_g_pad = pad(inp.g, padding)
for i in range(m):
inp_slice = inp_pad[i]
inp_g_slice = inp_g_pad[i]
for j in range(h):
for k in range(w):
                vert_start = j * stride
                vert_end = vert_start + f
                horiz_start = k * stride
                horiz_end = horiz_start + f
for l in range(out_c):
out_g_slice = out.g[i, j, k, l]
weight_slice = weight[:,:,:,l]
                    inp_g_slice[vert_start:vert_end, horiz_start:horiz_end, :] += weight_slice * out_g_slice
weight.g[:,:,:,l] += inp_slice[vert_start:vert_end, horiz_start:horiz_end, :].cuda() * out_g_slice.cuda()
bias.g[:, :, :, l] += out_g_slice
if padding != 0:
inp.g[i,:,:,:] = inp_g_slice[padding:-padding, padding:-padding, :]
else:
inp.g[i,:,:,:] = inp_g_slice
return inp.g, weight.g, bias.g
class Pool(Module):
def __init__(self, filter_shape, padding, stride, pool_type):
self.filter_shape, self.padding, self.stride, self.pool_type = filter_shape, padding, stride, pool_type
def forward(self, inp):
if self.pool_type == "max-pooling" :
self.out = max_pooling_forward(inp, self.filter_shape, self.padding, self.stride)
return self.out
        elif self.pool_type == 'average-pooling':
            pass  # average pooling not implemented
def bwd(self, out, inp):
if self.pool_type == "max-pooling":
inp.g = max_pooling_backward(out, inp, self.filter_shape, self.padding, self.stride)
        elif self.pool_type == 'average-pooling':
            pass  # average pooling not implemented
def update(self, learning_rate, adam_optimization, layerIndex, inp):
inp -= inp.g * learning_rate
def grad_zero(self, inp, *args):
inp.g = inp.g * 0
def max_pooling_forward(inp, filter_shape, padding, stride):
f = filter_shape
m, n_h, n_w, n_c = inp.shape
h = int ((n_h - f + (2 * padding))/stride) + 1
w = int ((n_w - f + (2 * padding))/stride) + 1
out = torch.zeros(m, h, w, n_c).cuda()
# print(inp.shape,filter_shape,padding,stride, out.shape)
for i in range(m):
for j in range(h):
for k in range(w):
vert_start = j * stride
vert_end = vert_start + f
horiz_start = k * stride
horiz_end = horiz_start + f
for l in range(n_c):
inp_slice = inp[i, vert_start:vert_end, horiz_start:horiz_end, l] # shape -> fxf
# print(inp.shape, vert_start, vert_end, horiz_start, horiz_end )
out[i, j, k, l] = torch.max(inp_slice)
return out
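# Vectorized sketch of the same max pooling for the padding=0 case
# (illustrative): Tensor.unfold extracts all f-by-f windows at once, so the
# max reduction runs as a single GPU kernel instead of one per window.
def max_pooling_forward_vectorized(inp, filter_shape, stride):
    f = filter_shape
    windows = inp.unfold(1, f, stride).unfold(2, f, stride)  # (m, h, w, n_c, f, f)
    return windows.max(-1).values.max(-1).values             # (m, h, w, n_c)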
def max_pooling_backward(out, inp, filter_shape, padding, stride):
f = filter_shape
m, n_h_inp, n_w_inp, n_c_inp = inp.shape
m, n_h_out, n_w_out, n_c_out = out.shape
diff_dim_out_g = out.g.cuda()
if out.shape != out.g.shape: diff_dim_out_g = out.g.view(out.shape)
    # channel counts match between inp and out for pooling (n_c_inp == n_c_out)
inp.g = torch.zeros(inp.shape).cuda()
for i in range(m):
for j in range(n_h_out):
for k in range(n_w_out):
vert_start = j * stride
vert_end = vert_start + f
horiz_start = k * stride
horiz_end = horiz_start + f
for l in range(n_c_out):
inp_slice = inp[i, vert_start:vert_end, horiz_start:horiz_end, l] # shape -> fxf
inp.g[i, vert_start:vert_end, horiz_start:horiz_end, l] += (inp_slice == torch.max(inp_slice)).to(torch.float32) * diff_dim_out_g[i, j, k, l]
assert(inp.shape == inp.g.shape)
return inp.g
def get_data():
path = datasets.download_data(MNIST_URL, ext='.gz')
with gzip.open(path, 'rb') as f:
((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
return map(tensor, (x_train,y_train,x_valid,y_valid))
def normalize(x, m, s): return (x-m)/s
x_train,y_train,x_valid,y_valid = get_data()
train_mean,train_std = x_train[:10].mean(),x_train[:10].std()  # note: stats estimated from the first 10 images only
x_train = normalize(x_train, train_mean, train_std)
x_train = x_train[:5000]
y_train = y_train[:5000]
x_train = x_train.clone().detach().view(x_train.shape[0], 28, 28, 1)
y_zeros = torch.zeros(y_train.shape[0], 10)
x_valid = x_valid[:500]
y_valid = y_valid[:500]
x_valid = x_valid.clone().detach().view(x_valid.shape[0], 28, 28, 1)
y_zeros_valid = torch.zeros(y_valid.shape[0], 10)
print(y_valid.shape)
for i in range(y_zeros.shape[0]):
y_zeros[i,y_train[i]] = 1
y_train = y_zeros
for i in range(y_zeros_valid.shape[0]):
y_zeros_valid[i,y_valid[i]] = 1
y_valid = y_zeros_valid
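# The two label-encoding loops above can also be written as a single scatter_
# call (an equivalent sketch):
def one_hot(labels, num_classes=10):
    out = torch.zeros(labels.shape[0], num_classes)
    return out.scatter_(1, labels.long().unsqueeze(1), 1.0)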
# w1 = torch.randn(5, 5, 1, 32).cuda() * math.sqrt(2/(5*5*1))
# b1 = torch.zeros(1, 1, 1, 32).cuda()
# w2 = torch.randn(5, 5, 32, 64).cuda() * math.sqrt(2/(5*5*32))
# b2 = torch.zeros(1, 1, 1, 64).cuda()
# w3 = torch.randn(5, 5, 64, 32).cuda() * math.sqrt(2/(5*5*64))
# b3 = torch.zeros(1, 1, 1, 32).cuda()
# w4 = torch.randn(3136, 1024).cuda() * math.sqrt(2/3136)
# b4 = torch.zeros(1024).cuda()
# w5 = torch.randn(1024, 256).cuda() * math.sqrt(2/1024)
# b5 = torch.zeros(256).cuda()
# w6 = torch.randn(256, 10).cuda() * math.sqrt(2/256)
# b6 = torch.zeros(10).cuda()
w1 = torch.load('w1.pt').cuda()
w2 = torch.load('w2.pt').cuda()
w3 = torch.load('w3.pt').cuda()
w4 = torch.load('w4.pt').cuda()
w5 = torch.load('w5.pt').cuda()
w6 = torch.load('w6.pt').cuda()
b1 = torch.load('b1.pt').cuda()
b2 = torch.load('b2.pt').cuda()
b3 = torch.load('b3.pt').cuda()
b4 = torch.load('b4.pt').cuda()
b5 = torch.load('b5.pt').cuda()
b6 = torch.load('b6.pt').cuda()
p1 = {'filter_shape': 2, 'stride': 2, 'padding': 0, 'pool_type': 'max-pooling'}
p2 = {'filter_shape': 2, 'stride': 1, 'padding': 0, 'pool_type': 'max-pooling'}
model = Model(w1, b1, w2, b2, p1, w3, b3, w4, b4, w5, b5, w6, b6)
epochs = 10
cost_list = []
batch_size = 100
total_batches = int(x_train.shape[0] / batch_size)
print(total_batches)
for j in range(epochs):
print('epoch: ', j)
cost_total = 0
for i in range(total_batches):
x = x_train[i*batch_size:(i*batch_size)+batch_size]
y = y_train[i*batch_size:(i*batch_size)+batch_size]
# print("MAIN", x_train.mean(), x_train.std(), y_train.mean(), y_train.std())
loss = model(x, y)
print("loss: ", loss)
cost_total += loss
cost_list.append(loss)
model.backward()
model.fit(learning_rate=0.0001, adam_optimization=False)
model.zero_grad()
    cost_avg = cost_total/total_batches  # average loss per batch over the epoch
cost_list.append(cost_avg)
print('cost_avg: ', cost_avg)
predict_loss = model.predict(x_valid, y_valid)
print('predict loss : ', predict_loss)
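A first mitigation worth noting (a sketch; it removes the per-slice .cuda() copies inside conv_forward, though not the kernel-launch overhead of the Python loops): move the datasets to the GPU once, before the epoch loop.
x_train = x_train.cuda()
y_train = y_train.cuda()
x_valid = x_valid.cuda()
y_valid = y_valid.cuda()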
Graphics card: NVIDIA GeForce RTX 2070 SUPER
Processor: Intel Core i5-10400
Coding environment: Jupyter Notebook
CUDA & cuDNN version: 11.0
PyTorch version: 1.6.0
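A quick way to confirm the device setup itself is fine (an illustrative check, not part of the training code): CUDA should be available, the loaded weights should report cuda:0, and a plain matmul loop should drive nvidia-smi to near-full utilization.
import torch
print(torch.cuda.is_available())   # expect True
print(w1.device)                   # expect cuda:0 for the loaded weights
x = torch.randn(4096, 4096, device='cuda')
for _ in range(200):               # watch nvidia-smi while this runs
    _ = x @ x
torch.cuda.synchronize()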
Stack Overflow version: https://stackoverflow.com/questions/63479920/how-to-make-custom-code-in-python-utilize-gpu-while-using-pytorch-tensors-and-ma/63483212?noredirect=1#comment112264077_63483212