How to make custom code in Python utilize the GPU while using tensor and matrix functions from PyTorch?

I’ve created a CNN from scratch using only PyTorch tensors and matrix-operation functions, in the hope of utilizing the GPU. To my surprise, the GPU stays at 0% utilization and training doesn’t seem to be any faster than running on my CPU.
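
Just to confirm the setup, here is a minimal sanity check (only a sketch, assuming CUDA is available; it is not part of the training code below) that a tensor really lives on the GPU and that timings account for CUDA's asynchronous execution:

import time
import torch

print(torch.cuda.is_available())   # True if PyTorch can see the GPU
x = torch.randn(1000, 1000).cuda()
print(x.device)                    # e.g. cuda:0

torch.cuda.synchronize()           # CUDA kernels run asynchronously, so sync before timing
start = time.time()
y = x @ x                          # one large matmul keeps the GPU busy
torch.cuda.synchronize()
print('elapsed:', time.time() - start)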

import operator
from pathlib import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor, nn
import numpy as np
import pandas as pd
MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'

class Module():
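  # Minimal layer protocol: __call__ caches the layer's inputs and output so that
  # backward(), zero_grad() and fit() can later call the subclass's bwd(),
  # grad_zero() and update() with those same arguments.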
  def __call__(self, *args):
    self.args = args
    self.out = self.forward(*args)
    # print(self.out)
    return self.out
  
  def forward(self):
    raise Exception("Forward propagation isn't defined.")
  
  def backward(self):
    self.bwd(self.out, *self.args)

  def zero_grad(self):
    self.grad_zero(*self.args)

  def fit(self, learning_rate, adam_optimization, idx):
    self.update(learning_rate, adam_optimization, idx, *self.args)

class Model():
  def __init__(self, w1, b1, w2, b2, p1, w3, b3, w4, b4, w5, b5, w6, b6):
    # print(p1)
    # w1 (4, 4, 1, 12)
    # w2 (4, 4, 12, 24)
    # w3 (4, 4, 24, 48)
    # w4 (432, 50)
    # w5 (50, 10)
    self.layers = [
                   Conv(w1,b1,0,1,is_first_layer=True), # torch.Size([3, 25, 25, 12])
                   Relu(), # torch.Size([3, 25, 25, 12])
                   Pool(p1['filter_shape'], p1['padding'], p1['stride'], p1['pool_type']), # torch.Size([3, 23, 23, 12])
                   Conv(w2,b2,0,1), # torch.Size([3, 10, 10, 24])
                   Relu(), # torch.Size([3, 10, 10, 24])
                   Pool(p2['filter_shape'], p2['padding'], p2['stride'], p2['pool_type']), # torch.Size([3, 8, 8, 24]) 
                  #  Conv(w3,b3,0,1), # torch.Size([3, 3, 3, 48])
                  #  Relu(), # torch.Size([3, 3, 3, 48])
                   Lin(w4,b4), # torch.Size([3, 50])
                   Relu(), # torch.Size([3, 50])
                   Lin(w5,b5), # torch.Size([3, 10])
                   Relu(), # torch.Size([3, 10])
                   Lin(w6,b6), # torch.Size([3, 10])
                  #  Relu(), # torch.Size([3, 10])
                   Softmax()
                  ]
    self.loss = CEL()
  
  def __call__(self, x, targ):
    for l in self.layers: 
      x = l(x)
    return self.loss(x, targ)
  
  def backward(self):
    self.loss.backward()
    for l in reversed(self.layers): l.backward()
  
  def fit(self, learning_rate, adam_optimization=False):
    self.loss.fit(learning_rate, adam_optimization, len(self.layers))
    for idx in reversed(range(len(self.layers))): 
      self.layers[idx].fit(learning_rate, adam_optimization, idx)

  def zero_grad(self):
    self.loss.zero_grad()
    for l in reversed(self.layers): l.zero_grad()

  def predict(self, x_valid, y_valid):
    a = x_valid.cuda()
    for l in self.layers: 
      a = l(a)
    self.predictions = a.cuda()
    return self.predictions, self.loss(a, y_valid)

class Relu(Module):
  def forward(self, inp):
    return (inp).clamp_min(0) - 0.5

  def bwd(self, out, inp):
    # print(inp.shape, out.g.shape)
    diff_dim_out_g = out.g.cuda()
    if inp.shape != out.g.shape : diff_dim_out_g = out.g.view(inp.shape)
    inp.g = (inp.cuda() > 0).float() * diff_dim_out_g

  def update(self, learning_rate, adam_optimization, layerIndex, inp):
    inp -= inp.g * learning_rate
  
  def grad_zero(self, inp, *args):
    inp.g = inp.g * 0

class Lin(Module):
  def __init__(self, w, b, is_first_layer=False): 
    self.w, self.b = w, b
    self.is_first_layer = is_first_layer

  def forward(self, inp): 
    diff_dim_inp = inp
    if len(inp.shape) == 4 :
      diff_dim_inp = inp.view(inp.shape[0], inp.shape[1] * inp.shape[2] * inp.shape[3]).cuda()
    self.out = diff_dim_inp@self.w + self.b
    return self.out

  def bwd(self, out, inp):
    diff_dim_inp = inp
    if len(inp.shape) == 4 :
      diff_dim_inp = inp.view(inp.shape[0], inp.shape[1] * inp.shape[2] * inp.shape[3]).cuda()
    inp.g =  out.g @ self.w.t()
    # print((out.g @ self.w.t()).shape, inp.g.shape, '## lin inp.g')

    self.w.g = diff_dim_inp.t() @ out.g
    self.b.g = out.g.sum(0).sum(0).sum(0)
  
  def update(self, learning_rate, adam_optimization, layerIndex, inp):
    w_g = self.w.g
    b_g = self.b.g
    w = self.w
    b = self.b
    # print('its all up to you.', b_g, b_g.shape)
    if adam_optimization == True:
      self.w, self.b = update_parameters_with_adam(w, b, w_g, b_g, layerIndex, learning_rate )
    else:
      self.w -= self.w.g * learning_rate
      self.b -= self.b.g * learning_rate
    self.b.g = b_g
    self.w.g = w_g

    diff_inp_g = inp.g
    if self.is_first_layer != True :
      if inp.shape != inp.g.shape: diff_inp_g = inp.g.view(inp.shape).cuda()
      inp -= diff_inp_g * learning_rate

  def grad_zero(self, inp, *args):
    # print(self.b.shape)  # PROBLEM IS HERE! :)
    inp.g = inp.g * 0
    self.w.g = self.w.g * 0
    self.b.g = self.b.g * 0

class Conv(Module):
  def __init__(self, w, b, padding, stride, is_first_layer=False):
    self.w, self.b, self.padding, self.stride = w, b, padding, stride
    self.is_first_layer = is_first_layer

  def forward(self, inp): 
    self.out = conv_forward(inp, self.w, self.b, self.padding, self.stride)
    return self.out
  
  def bwd(self, out, inp):
    inp.g, self.w.g, self.b.g = conv_backward(inp, out, self.w, self.b, self.padding, self.stride)
    # print('This is gross ;)', self.b.g.shape)

  def update(self, learning_rate, adam_optimization, layerIndex, inp):
    w_g = self.w.g
    b_g = self.b.g
    # print('how does it make it well ?', b_g.shape, b_g)
    if adam_optimization == True:
      self.w, self.b = update_parameters_with_adam(self.w, self.b, w_g, b_g, layerIndex, learning_rate)
    else:
      self.w -= self.w.g * learning_rate
      self.b -= self.b.g * learning_rate
    self.w.g = w_g
    self.b.g = b_g
    if self.is_first_layer != True : inp -= inp.g * learning_rate
  
  def grad_zero(self, inp, *args):
    # print('hit me', self.w.g.shape, self.b.g.shape)
    inp.g = inp.g * 0
    self.w.g = self.w.g * 0
    self.b.g = self.b.g * 0

class Softmax(Module):
  def forward(self, inp):
    # print('some', inp.shape, inp.max(1).values.shape)
    inp = inp.cpu()
    e_x = np.exp(inp - inp.max(1).values.unsqueeze(-1))
    self.out = (e_x / e_x.sum(1).unsqueeze(-1)).cuda()
    # print('prediction: ', self.out)
    return self.out
  
  def bwd(self, out, inp):
    # inp.g = (out * (1 - out)).cuda()
    s = torch.reshape(out,(-1,1))
    # print(out.shape, s.shape, s.transpose(0,1).shape, out.g.shape, torch.diagflat(s).shape, torch.matmul(s, s.transpose(0,1)).shape)
    # print((torch.diagflat(s) - torch.matmul(s, s.transpose(0,1))).shape, torch.reshape(out.g, (-1,1)).shape, torch.matmul((torch.diagflat(s) - torch.matmul(s, s.transpose(0,1))), torch.reshape(out.g, (-1,1))).shape)
    
    # torch.Size([200, 1]) torch.Size([1, 200]) torch.Size([20, 10]) torch.Size([200, 200])
    # print(out.shape, y.shape, (out - y).shape)
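    # combined softmax + cross-entropy gradient: (prediction - target);
    # note this relies on the global batch labels `y` from the training loop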
    inp.g = out.cuda() - y.cuda()
  
  def update(self, learning_rate, adam_optimization, layerIndex, inp):
    # print(inp.shape, inp.shape)
    inp -= inp.g * learning_rate

  def grad_zero(self, inp, *args):
    inp.g = inp.g * 0

class Mse(Module):
  def forward(self, inp, targ):  
    self.out = ((inp.squeeze(-1) - targ).pow(2).mean()).cuda()
    return self.out

  def bwd(self, out, inp, targ):  
    inp.g = 2 * (inp - targ) / targ.shape[0]

    # w_plus = w + epsilon                               
    # w_minus = w - epsilon                              
    # J_plus = self.forward(x,w_plus,b)                                  
    # J_minus = self.forward(x,w_minus,b)                                 
    # gradapprox = (J_plus - J_minus) / (2 * epsilon) 

    # numerator = np.linalg.norm(grad_w - gradapprox)                               
    # denominator = np.linalg.norm(grad_w) + np.linalg.norm(gradapprox)                            
    # difference = numerator/denominator
    # return difference

  def update(self, learning_rate, adam_optimization, layerIndex, inp, *args):
    inp -= inp.g * learning_rate  
  
  def grad_zero(self, inp, *args):
    inp.g = inp.g * 0

class CEL(Module):
  def forward(self, inp, targ):  
    self.out = (- (1/targ.shape[0]) * ( targ.cuda() * (torch.log(inp.cuda())) ).sum()).cuda()
    return self.out

  def bwd(self, out, inp, targ):  
    inp.g = (- targ.cuda()/inp.cuda()).cuda()

  def update(self, learning_rate, adam_optimization, layerIndex, inp, *args):
    inp -= inp.g * learning_rate  
  
  def grad_zero(self, inp, *args):
    inp.g = inp.g * 0

def conv_forward(inp, weight, bias, padding=0, stride=1, rotate=0):
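  # Direct (naive) convolution (cross-correlation): for every example, output row,
  # output column and output channel, multiply an f x f x n_c window of the input
  # by the matching filter and sum -- m * h * w * n_f Python-level iterations per pass.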
  m, n_h, n_w, n_c = inp.shape
  f, f, n_c, n_f = weight.shape

  assert(f <= n_h and f <= n_w)

  h = int ( ((n_h - f + (2 * padding)) / stride ) ) + 1
  w = int ( ((n_w - f + (2 * padding)) / stride ) ) + 1

  out = torch.zeros((m, h, w, n_f)).cuda()

  for i in range(m):
    for j in range(h):
      for k in range(w):
        for c in range(n_f):
          vert_start, horiz_start = j * stride, k * stride
          vert_end = vert_start + f
          horiz_end = horiz_start + f
          # for l in range(n_f):
          inp_slice = inp[i, vert_start:vert_end, horiz_start:horiz_end, :].cuda()
          # print('look into it', weight[:,:,:,c].shape, inp_slice.shape, bias[:,:,:,c].shape)

          s = (weight[:,:,:,c].cuda() * inp_slice).sum() + bias[:,:,:,c].cuda()  # elementwise product over the window, summed, plus the bias
          # print('s', s.shape, inp_slice.shape, weight.shape, vert_start,vert_end, horiz_start,horiz_end)
          out[i, j, k, c] = s

  return out

def pad(tensor, number):
  pd = (0, 0, number, number, number, number, 0, 0) # pad the H and W dims by `number` on each side; batch and channel dims untouched
  out = torch.nn.functional.pad(tensor, pd, "constant", 0)
  return out

def conv_backward(inp, out, weight, bias, padding=0, stride=1):
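  # Accumulates gradients for the input, weights and bias by revisiting every
  # sliding-window position; the window offsets use j and k directly, so a
  # stride of 1 is effectively assumed here (which matches how it is called).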
  inp.g = torch.zeros(inp.shape).cuda()
  bias.g = torch.zeros(bias.shape).cuda()
  weight.g = torch.zeros(weight.shape).cuda()
  
  m, n_h, n_w, c = inp.shape # torch.Size([3, 23, 23, 12])
  m, h, w, out_c = out.g.shape # torch.Size([3, 10, 10, 24])
    
  f, f, n_c, n_f = weight.shape # torch.Size([4, 4, 12, 24])
    
  inp_pad = pad(inp, padding)
  inp_g_pad = pad(inp.g, padding)

  for i in range(m):
    inp_slice = inp_pad[i]
    inp_g_slice = inp_g_pad[i]
    for j in range(h):
      for k in range(w):
        vert_start = j 
        vert_end = vert_start + f
        horiz_start = k  
        horiz_end = horiz_start + f 
        for l in range(out_c):
          out_g_slice = out.g[i, j, k, l]
          weight_slice = weight[:,:,:,l]
          inp_g_slice[vert_start:vert_end, horiz_start:horiz_end, :] += (weight_slice * out_g_slice)  # issue: 5x5 * 5x4 
          weight.g[:,:,:,l] += inp_slice[vert_start:vert_end, horiz_start:horiz_end, :].cuda() * out_g_slice.cuda()
          bias.g[:, :, :, l] += out_g_slice
    if padding != 0:
      inp.g[i,:,:,:] = inp_g_slice[padding:-padding, padding:-padding, :]
    else:
      inp.g[i,:,:,:] = inp_g_slice

  # print('just look at me!', bias.g.shape)

  return inp.g, weight.g, bias.g

class Pool(Module):
  def __init__(self, filter_shape, padding, stride, pool_type):
    self.filter_shape, self.padding, self.stride, self.pool_type = filter_shape, padding, stride, pool_type

  def forward(self, inp):
    if self.pool_type == "max-pooling" :
      self.out = max_pooling_forward(inp, self.filter_shape, self.padding, self.stride)
      return self.out
    elif self.pool_type == 'average-pooling':
      ''

  def bwd(self, out, inp):
    if self.pool_type == "max-pooling":
      inp.g = max_pooling_backward(out, inp, self.filter_shape, self.padding, self.stride)
    elif self.pool_type == 'average-pooling':
      ''

  def update(self, learning_rate, adam_optimization, layerIndex, inp):
    inp -= inp.g * learning_rate       
  
  def grad_zero(self, inp, *args):
    inp.g = inp.g * 0

def max_pooling_forward(inp, filter_shape, padding, stride):
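  # Naive max pooling: takes the maximum of each f x f window, again with one
  # Python-level iteration per output element.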
  f = filter_shape
  m, n_h, n_w, n_c = inp.shape
  

  h = int ((n_h - f + (2 * padding))/stride) + 1
  w = int ((n_w - f + (2 * padding))/stride) + 1
  out = torch.zeros(m, h, w, n_c).cuda()
  # print(inp.shape,filter_shape,padding,stride, out.shape)
  for i in range(m):
    for j in range(h):
      for k in range(w):
        vert_start = j * stride
        vert_end = vert_start + f
        horiz_start = k * stride
        horiz_end = horiz_start + f
        # print('yepudi', k, stride, f, horiz_start, horiz_end)
        for l in range(n_c):
          inp_slice = inp[i, vert_start:vert_end, horiz_start:horiz_end, l] # shape -> fxf
          # print(inp.shape, vert_start, vert_end, horiz_start, horiz_end )
          out[i, j, k, l] = torch.max(inp_slice)
  return out

def max_pooling_backward(out, inp, filter_shape, padding, stride):
  f = filter_shape
  m, n_h_inp, n_w_inp, n_c_inp = inp.shape
  m, n_h_out, n_w_out, n_c_out = out.shape

  diff_dim_out_g = out.g.cuda()
  if out.shape != out.g.shape: diff_dim_out_g = out.g.view(out.shape)
  
  # h = int ((n_h - f + (2 * padding))/stride) + 1
  # w = int ((n_w - f + (2 * padding))/stride) + 1

  # n_c_inp == n_c_out for pooling (the channel count is preserved)

  inp.g = torch.zeros(inp.shape).cuda()

  for i in range(m): 
    for j in range(n_h_out):
      for k in range(n_w_out):
        vert_start = j * stride
        vert_end = vert_start + f
        horiz_start = k * stride
        horiz_end = horiz_start + f
        for l in range(n_c_out):
          inp_slice = inp[i, vert_start:vert_end, horiz_start:horiz_end, l] # shape -> fxf
          inp.g[i, vert_start:vert_end, horiz_start:horiz_end, l] += (inp_slice == torch.max(inp_slice)).to(torch.float32) * diff_dim_out_g[i, j, k, l]
  
  assert(inp.shape == inp.g.shape)
  return inp.g

def get_data():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))

def normalize(x, m, s): return (x-m)/s

x_train,y_train,x_valid,y_valid = get_data()

train_mean,train_std = x_train[:10].mean(),x_train[:10].std()
train_mean,train_std

x_train = normalize(x_train, train_mean, train_std)
x_train = x_train[:5000]
y_train = y_train[:5000]

x_train = x_train.clone().detach().view(x_train.shape[0], 28, 28, 1)
y_zeros = torch.zeros(y_train.shape[0], 10)

x_valid = x_valid[:500]
y_valid = y_valid[:500]

x_valid = x_valid.clone().detach().view(x_valid.shape[0], 28, 28, 1)
y_zeros_valid = torch.zeros(y_valid.shape[0], 10)

print(y_valid.shape)
for i in range(y_zeros.shape[0]):
  y_zeros[i,y_train[i]] = 1
y_train = y_zeros

for i in range(y_zeros_valid.shape[0]):
  y_zeros_valid[i,y_valid[i]] = 1
# print(y_zeros)
y_valid = y_zeros_valid

# w1 = torch.randn(5, 5, 1, 32).cuda() * math.sqrt(2/(5*5*1))
# b1 = torch.zeros(1, 1, 1, 32).cuda() 
# w2 = torch.randn(5, 5, 32, 64).cuda() * math.sqrt(2/(5*5*32))
# b2 = torch.zeros(1, 1, 1, 64).cuda() 
# w3 = torch.randn(5, 5, 64, 32).cuda() * math.sqrt(2/(5*5*64))
# b3 = torch.zeros(1, 1, 1, 32).cuda()
# w4 = torch.randn(3136, 1024).cuda() * math.sqrt(2/3136)
# b4 = torch.zeros(1024).cuda()
# w5 = torch.randn(1024, 256).cuda() * math.sqrt(2/1024)
# b5 = torch.zeros(256).cuda()
# w6 = torch.randn(256, 10).cuda() * math.sqrt(2/256)
# b6 = torch.zeros(10).cuda()

w1 = torch.load('w1.pt').cuda()
w2 = torch.load('w2.pt').cuda()
w3 = torch.load('w3.pt').cuda()
w4 = torch.load('w4.pt').cuda()
w5 = torch.load('w5.pt').cuda()
w6 = torch.load('w6.pt').cuda()
b1 = torch.load('b1.pt').cuda()
b2 = torch.load('b2.pt').cuda()
b3 = torch.load('b3.pt').cuda()
b4 = torch.load('b4.pt').cuda()
b5 = torch.load('b5.pt').cuda()
b6 = torch.load('b6.pt').cuda()

p1 = {'filter_shape': 2, 'stride': 2, 'padding': 0, 'pool_type': 'max-pooling'}

p2 = {'filter_shape': 2, 'stride': 1, 'padding': 0, 'pool_type': 'max-pooling'}

model = Model(w1, b1, w2, b2, p1, w3, b3, w4, b4, w5, b5, w6, b6)

epochs = 10
cost_list = []
batch_size = 100
total_batches = int(x_train.shape[0] / batch_size)
print(total_batches)

for j in range(epochs):
  print('epoch: ', j)
  cost_total = 0
  for i in range(total_batches):
    x = x_train[i*batch_size:(i*batch_size)+batch_size]
    y = y_train[i*batch_size:(i*batch_size)+batch_size]
    # print("MAIN", x_train.mean(), x_train.std(), y_train.mean(), y_train.std())
    loss = model(x, y)
    print("loss: ", loss)
    cost_total += loss 
    cost_list.append(loss)
    model.backward()
    model.fit(learning_rate=0.0001, adam_optimization=False)
    model.zero_grad()
  # print('y_train: ', y_train)
  cost_avg = cost_total/total_batches
  cost_list.append(cost_avg)
  print('cost_avg: ', cost_avg)

predict_loss = model.predict(x_valid, y_valid)
print('predict loss : ', predict_loss)

Graphics card: NVIDIA GeForce 2070 SUPER

Processor: Intel i5-10400

Coding environment: Jupyter Notebook

CUDA & cuDNN version: 11.0

PyTorch version: 1.6.0

Stack Overflow version: https://stackoverflow.com/questions/63479920/how-to-make-custom-code-in-python-utilize-gpu-while-using-pytorch-tensors-and-ma/63483212?noredirect=1#comment112264077_63483212

I ran your code and it did use my GPU (a GTX 960M), but the utilization percentage remained low (10-15%). I'm using PyTorch 1.6.0 with CUDA 10.2.

@user_123454321
Oh yeah… that is exactly the issue I’m trying to fix. By the way, I’m new to PyTorch, so it would be helpful to know: is it normal to write code the way I did, or is there another way to write custom PyTorch classes for Convolution, ReLU, etc. that utilizes the GPU the way the native PyTorch layers do?

You have used four nested for-loops in your convolution algorithm. I suspect that negates any performance gain you could get from the GPU: each iteration launches only a tiny amount of GPU work, so the Python loop overhead dominates. This fastai tutorial might be helpful. Notice how the speed increases dramatically as they remove the loops. A rough, loop-free sketch of your conv_forward is shown below.
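
For illustration, here is a rough sketch (untested against your exact pipeline; the name conv_forward_vectorized is just illustrative) of how the same cross-correlation can be computed without the Python loops, using torch.nn.functional.unfold so each forward pass becomes a handful of large GPU operations. It assumes the NHWC input layout and the (f, f, C_in, C_out) filter shape from your conv_forward:

import torch
import torch.nn.functional as F

def conv_forward_vectorized(inp, weight, bias, padding=0, stride=1):
    # inp:    (m, H, W, C_in)  -- NHWC, as in conv_forward above
    # weight: (f, f, C_in, C_out)
    # bias:   broadcastable to (1, 1, 1, C_out)
    m, H, W, C_in = inp.shape
    f, _, _, C_out = weight.shape

    x = inp.permute(0, 3, 1, 2)                       # unfold expects NCHW
    cols = F.unfold(x, kernel_size=f,
                    padding=padding, stride=stride)   # (m, C_in*f*f, L) patches
    # reorder the filters to match unfold's (C_in, f, f) patch layout
    w = weight.permute(2, 0, 1, 3).reshape(C_in * f * f, C_out)
    out = cols.transpose(1, 2) @ w                    # (m, L, C_out): one big batched matmul
    out_h = (H + 2 * padding - f) // stride + 1
    out_w = (W + 2 * padding - f) // stride + 1
    return out.view(m, out_h, out_w, C_out) + bias.reshape(1, 1, 1, -1)

The same im2col-plus-matmul idea applies to the backward pass and to the pooling layers: the fewer Python-level iterations, the more work each kernel launch gets, and the more the GPU actually has to do.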

Thanks for pointing that out! I’ll look into it.