Working code breaks when run on Google Colab

The code runs fine on my local machine, but breaks when I run it on Colab (which I suppose is good news, once I find out what's killing it). The relevant sections of the code – a simple feed-forward random network trained on MNIST – are below, starting with the imports:

# PyTorch packages:
import torch
import torch.nn as nn                       # neural net package
import torch.nn.functional as F             # useful functions, e.g., convolutions & loss functions
from torch import optim                     # optimizers (torch.optim)
from torch.utils.data import TensorDataset  # for wrapping tensors
from torch.utils.data import DataLoader     # for managing batches

# Numpy, scipy, and plotting:
import numpy as np
from scipy.stats import norm         # Gaussian fitting
import scipy.integrate as integrate  # integration
import matplotlib.pyplot as plt      # plotting
import seaborn as sns; sns.set()     # nicer plotting
import pandas as pd                  # dataframe for use with seaborn

# File i/o:
import pickle  # for unpickling MNIST data
import gzip    # for opening pickled MNIST data file
import h5py    # HDF5

# Miscellaneous:
import math
import random  # random number generators
import re      # regular expressions
import gc      # garbage collection

On my local machine, I import MNIST from elsewhere on the drive; the cleanest way I found to do this in Colab is to load it via Keras instead:

# load MNIST:
from keras.datasets import mnist
(x_train, y_train), (x_valid, y_valid) = mnist.load_data()

# convert from numpy to torch tensors:
x_train, y_train, x_valid, y_valid = map(torch.from_numpy, (x_train, y_train, x_valid, y_valid))
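
In case the data format matters, here's a quick sanity check of what ends up in the tensors after the conversion (the shapes/dtypes in the comments are what the Keras loader documents, not anything specific to my setup):

# quick check of shapes and dtypes after conversion:
print(x_train.shape, x_train.dtype)  # torch.Size([60000, 28, 28]) torch.uint8
print(y_train.shape, y_train.dtype)  # torch.Size([60000]) torch.uint8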

For my purposes, I needed layers with the parameters initialised from a Gaussian rather than the uniform distribution used by the built-in Linear layer, so I've created a custom class (which is subsequently loaded into an nn.Sequential object):

# linear layer z=Wx+b, with W,b drawn from normal distributions:
class GaussianLinear(nn.Module):
    def __init__(self, size_in, size_out, var_W, var_b):
        super().__init__()
        self.size_in, self.size_out = size_in, size_out
        self.var_W, self.var_b = var_W/size_in, var_b  # n.b., must scale var_W by layer width!

        # normally distributed weights with mean=0 and variance=var_W/N:
        norm_vec = torch.normal(mean=torch.zeros(size_out*size_in), std=math.sqrt(self.var_W))
        self.weights = nn.Parameter(norm_vec.view(size_out, size_in))
        
        # normally distributed biases with mean=0 and variance=var_b:
        self.bias = nn.Parameter(torch.normal(mean=torch.zeros(size_out), std=math.sqrt(var_b)))

    def forward(self, x):
        prod = torch.mm(x, self.weights.t())  # Wx
        return torch.add(prod, self.bias)     # Wx+b
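
As a quick standalone check of the layer itself (the batch and sizes below are chosen purely for illustration), a single GaussianLinear applied to a batch of flattened inputs gives the expected shape:

# minimal usage sketch (dummy batch; sizes are illustrative only):
layer = GaussianLinear(784, 100, var_W=1.5, var_b=0.05)
xb = torch.randn(64, 784)   # batch of 64 flattened 28x28 inputs
print(layer(xb).shape)      # torch.Size([64, 100])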

For training, I’m using the built-in SGD optimiser with PyTorch’s version of the cross-entropy as the loss function:

# compute gradients & update parameters for a single batch, given loss function & optimizer:
def loss_batch(model, loss_func, xb, yb, opt=None):
    loss = loss_func(model(xb), yb)  # compute specified loss function for the model

    if opt is not None:
        loss.backward()  # compute gradients
        opt.step()       # update parameters
        opt.zero_grad()  # zero gradients in preparation for next iteration

    # n.b., .detach().item() returns just the scalar loss value; returning the loss tensor itself would keep the entire computation graph alive!
    return loss.detach().item(), len(xb)

# compute accuracy; predicted digit corresponds to index with maximum value:
def accuracy(preds, yb):
    preds = torch.argmax(preds, 1)       # max argument along axis 1 (n.b., batch size must be > 1, else error)
    return (preds == yb).float().mean()  # for each element: 1 if prediction matches target, 0 otherwise

# train & evaluate the model, given loss function, optimizer, and DataLoaders:
def fit(epochs, model, loss_func, opt, train_dl, valid_dl, acc_list=-1, loss_list=-1):
    for epoch in range(epochs):
        model.train()  # ensure training mode (e.g., dropout)
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)

        model.eval()   # ensure evaluation mode (e.g., no dropout)
        with torch.no_grad():
            # compute loss:
            losses, nums = zip(*[loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl])  # * unzips
            val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
            
            # compute accuracy:
            accuracies = np.array([accuracy(model(xb), yb) for xb, yb in valid_dl])
            val_acc = np.sum(accuracies)/len(accuracies)
            
        print(epoch, val_loss, val_acc)  # monitor progress
        
        # save progress only if user passed lists (for speed if not):
        if isinstance(loss_list, list):
            loss_list.append(val_loss)
        if isinstance(acc_list, list):
            acc_list.append(val_acc)
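
Typical usage looks like this, with the optional lists passed in to record per-epoch validation metrics (the model and optimiser names here are just placeholders):

# e.g., record validation metrics per epoch (placeholder model/optimizer):
val_accs, val_losses = [], []
fit(epochs, some_model, F.cross_entropy, some_opt, train_dl, valid_dl,
    acc_list=val_accs, loss_list=val_losses)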

While I’d be surprised if it’s relevant here, since the error arises even with hooks disabled, I’ve included them because I use them to extract pre-activations for analysis (in a different notebook):

# simple class to store layer inputs & outputs:
class Hook():
    def __init__(self, module, input=None, output=None):
        self.hook = module.register_forward_hook(self.hook_fn)
        self.input = input
        #self.output = output

    def hook_fn(self, module, input, output):
        self.input.append(input[0].detach())
        #self.output.append(output.detach())
        
    def close(self):
        self.hook.remove()

# function that recursively registers hooks on Tanh layers:
def hook_layers(net, inputs, outputs):
    for name, layer in net._modules.items():
        # if nn.Sequential, register recursively on constituent modules:
        if isinstance(layer, nn.Sequential):
            hook_layers(layer, inputs, outputs)
        # individual module, register hook only on Tanh:
        elif isinstance(layer, nn.Tanh):
            Hook(layer, inputs, outputs) 
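
Typical usage: register the hooks on a network, run a forward pass, and read off the captured pre-activations (the network and batch here are placeholders):

# e.g., capture Tanh inputs for one forward pass (placeholder net/batch):
inputs, outputs = [], []
hook_layers(some_net, inputs, outputs)
with torch.no_grad():
    some_net(xb)
print(len(inputs))  # one captured tensor per Tanh layer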

For convenience, I’m using PyTorch’s DataLoader utility to handle batch management:

# return DataLoaders for training and validation sets, for batch management:
def get_data(x_train, y_train, x_valid, y_valid, batch_size):
    return (DataLoader(TensorDataset(x_train, y_train), batch_size, shuffle=False),
            DataLoader(TensorDataset(x_valid, y_valid), batch_size*2))

Now, to actually construct the networks, I’ve created the following class, which facilitates playing with different depths (the steady width reduction is chosen for research reasons):

# Construct a Gaussian neural network consisting of GaussianLinear layers followed by Tanh layers.
# The layer widths are steadily reduced from input_dim to output_dim
# in step sizes of (input_dim - output_dim)/num_layers (n.b., this implies a maximum depth).
def build_network(num_layers, input_dim, output_dim, var_w, var_b): 
    # determine how much to shrink each layer:
    diff = input_dim - output_dim   
    if num_layers > diff:
        raise Exception('Specified number of layers exceeds maximum value consistent\n'
                        'with monotonic reduction in layer widths. Max allowed depth is {}'.format(diff))
            
    shrink = math.floor(diff/num_layers)  # n.b., rounding up can over-decimate in deep networks!
    
    # compute layer widths:
    widths = []
    for i in range(num_layers):
        widths.append(input_dim - shrink*i)      
    
    # output layer:
    widths.append(output_dim)
    
    # construct and add layers to list (no need to use nn.ModuleList):
    mlist = []
    for i in range(num_layers):
        mlist.append(GaussianLinear(widths[i], widths[i+1], var_w, var_b))
        mlist.append(nn.Tanh())
    
    return nn.Sequential(*mlist)
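
For example (with illustrative variances), asking for three layers shrinks each hidden layer by floor((784-10)/3) = 258, so the widths work out to 784 -> 526 -> 268 -> 10, each linear layer followed by a Tanh:

# e.g., a 3-layer network with illustrative variances:
net = build_network(3, 784, 10, 1.5, 0.05)
print(net)  # Sequential of GaussianLinear/Tanh pairs, widths 784 -> 526 -> 268 -> 10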

Finally, I have a function that encapsulates the creation and training of the different models, and optionally writes various data to disk in HDF5 format (the error occurs even with all write flags set to False, so I haven’t included the write functions here, since they’re not relevant to the problem):

# construct and train models with a range of depths;
# pass -1 (or any non-str) for file names to avoid writing:
def train_models(depth_min, depth_max, depth_step=1, file_acc='accuracies.hdf5',
                 write_params=False, file_params='parameters.hdf5',
                 hooks=False, file_hook='hooks.hdf5', 
                 save_model=False, file_model='model.hdf5'):
    depth = np.arange(depth_min, depth_max, depth_step)
    print('Depth list: ', depth)
            
    # construct new set of models & associated optimizers:
    model = []
    opt = []
    for i,d in enumerate(depth):
        # create model, append to list:
        model.append(build_network(d, 784, 10, var_weight, var_bias))

        # create optimizer:
        opt.append(optim.SGD(model[i].parameters(), rate, momentum))

    # optionally add hooks to store layer inputs/outputs:
    if hooks:
        inputs, outputs = [], []
        for i in range(len(model)):
            inputs.append([])
            outputs.append([])
            hook_layers(model[i], inputs[i], outputs[i])
            
    # train models, optionally write data:
    for i in range(len(model)):
        accuracies = []  # store accuracies
        
        print('\nTraining model ', i, ' with depth ', depth[i], '...')
        
        fit(epochs, model[i], loss_func, opt[i], train_dl, valid_dl, accuracies)
        
        if save_model:
            model_name = PATH_TO_DATA + re.sub(r'\.(.*)$', '', file_model) + '-{}.hdf5'.format(depth[i])
            torch.save(model[i].state_dict(), model_name)
        
        # optionally write accuracies in hdf5 format:
        if isinstance(file_acc, str):
            write_accuracies(file_acc, depth[i], accuracies, var_weight, var_bias)

        # optionally write weights, biases in hdf5 format:
        if write_params and isinstance(file_params, str):
            write_parameters(file_params, model[i], depth[i], var_weight, var_bias)
            
        # optionally write hooks in hdf5 format:
        if hooks and isinstance(file_hook, str):
            write_hooks(file_hook, depth[i], inputs[i], outputs[i])
            # clear memory:
            model[i], opt[i], inputs[i], outputs[i] = -1, -1, -1, -1
            gc.collect()
        
    print('\nTraining complete.\n')

For completeness, here’s how I’m setting hyper-parameters:

# set hyperparameters:
rate = 0.005
epochs = 20
momentum = 0.8
batch_size = 64

# load training & validation data into DataLoaders:
train_dl, valid_dl = get_data(x_train, y_train, x_valid, y_valid, batch_size)

# set loss function:
loss_func = F.cross_entropy

Now, the idea is that I fix the distributions of the weights and biases, and can then easily construct and train a list of networks of arbitrary depths – in this case, from 10 to 50 (inclusive) in steps of 5. As mentioned above, you can simply ignore the various file i/o arguments.

# variances for Gaussian parameter initialization:
var_weight = 1.5
var_bias = 0.05

train_models(10, 51, 5, file_acc='acc-150.hdf5',
             write_params=False, file_params='para-150.hdf5',
             hooks=False, file_hook='hooks-150.hdf5',
             save_model=False, file_model='model-150.hdf5')

The above cell throws the error; the full output is:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-28-033c337389bd> in <module>()
      5 train_models(10,11,1, file_acc='acc-150.hdf5',
      6              write_params=False, file_params='para-150.hdf5',
----> 7              hooks=False, file_hook='hooks-150.hdf5', save_model=False, file_model='model-150.hdf5')

6 frames
<ipython-input-22-3ddf6466441f> in train_models(depth_min, depth_max, depth_step, file_acc, write_params, file_params, hooks, file_hook, save_model, file_model)
     32         print('\nTraining model ', i, ' with depth ', depth[i], '...')
     33 
---> 34         fit(epochs, model[i], loss_func, opt[i], train_dl, valid_dl, accuracies)
     35 
     36         if save_model:

<ipython-input-18-c423efcb85e5> in fit(epochs, model, loss_func, opt, train_dl, valid_dl, acc_list, loss_list)
     21         model.train()  # ensure training mode (e.g., dropout)
     22         for xb, yb in train_dl:
---> 23             loss_batch(model, loss_func, xb, yb, opt)
     24 
     25         model.eval()   # ensure evaluation mode (e.g., no dropout)

<ipython-input-18-c423efcb85e5> in loss_batch(model, loss_func, xb, yb, opt)
      1 # compute gradients & update parameters for a single batch, given loss function & optimizer:
      2 def loss_batch(model, loss_func, xb, yb, opt=None):
----> 3     loss = loss_func(model(xb), yb)  # compute specified loss function for the model
      4 
      5     if opt is not None:

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/container.py in forward(self, input)
    117     def forward(self, input):
    118         for module in self:
--> 119             input = module(input)
    120         return input
    121 

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

<ipython-input-16-e26371d4d8d9> in forward(self, x)
     14 
     15     def forward(self, x):
---> 16         prod = torch.mm(x, self.weights.t())  # Wx
     17         return torch.add(prod, self.bias)     # Wx+b

RuntimeError: self must be a matrix

Thanks for your help, and sincere apologies for the length!