I thought so (which I suppose is good news, once I find out what’s killing it). The relevant sections of the code – a simple feed-forward random network trained on MNIST – are below, starting with the following imports:

```
# PyTorch packages:
import torch
import torch.nn as nn # neural net package
import torch.nn.functional as F # useful functions, e.g., convolutions & loss functions
from torch import optim # optimizers (torch.optim)
from torch.utils.data import TensorDataset # for wrapping tensors
from torch.utils.data import DataLoader # for managing batches
# Numpy, scipy, and plotting:
import numpy as np
from scipy.stats import norm # Gaussian fitting
import scipy.integrate as integrate # integration
import matplotlib.pyplot as plt # plotting
import seaborn as sns; sns.set() # nicer plotting
import pandas as pd # dataframe for use with seaborn
# File i/o:
import pickle # for unpickling MNIST data
import gzip # for opening pickled MNIST data file
import h5py # HDF5
# Miscellaneous:
import math
import random # random number generators
import re # regular expressions
import gc # garbage collection
```

On my local machine, I import MNIST from elsewhere on the drive; the cleanest way I found to do this in Colab is to load it via Keras:

```
# load MNIST:
from keras.datasets import mnist
(x_train, y_train), (x_valid, y_valid) = mnist.load_data()
# convert from numpy to torch tensors:
x_train, y_train, x_valid, y_valid = map(torch.from_numpy, (x_train, y_train, x_valid, y_valid))
```
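As a quick sanity check (illustrative only, not part of the training pipeline), one can confirm what Keras hands back before the conversion; the raw arrays are 28×28 images:

```
# sanity check on shapes (illustrative; not part of the training code):
print(x_train.shape, x_valid.shape)  # torch.Size([60000, 28, 28]) torch.Size([10000, 28, 28])
```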

For my purposes, I needed layers with the parameters initialised along a Gaussian, rather than the uniform distribution used by the built-in `Linear` layer, so I’ve created a custom class (which will subsequently be loaded into an `nn.Sequential` object):

```
# linear layer z=Wx+b, with W,b drawn from normal distributions:
class GaussianLinear(nn.Module):
    def __init__(self, size_in, size_out, var_W, var_b):
        super().__init__()
        self.size_in, self.size_out = size_in, size_out
        self.var_W, self.var_b = var_W/size_in, var_b  # n.b., must scale var_W by layer width!
        # normally distributed weights with mean=0 and variance=var_W/N:
        norm_vec = torch.normal(mean=torch.zeros(size_out*size_in), std=math.sqrt(self.var_W))
        self.weights = nn.Parameter(norm_vec.view(size_out, size_in))
        # normally distributed biases with mean=0 and variance=var_b:
        self.bias = nn.Parameter(torch.normal(mean=torch.zeros(size_out), std=math.sqrt(var_b)))

    def forward(self, x):
        prod = torch.mm(x, self.weights.t())  # Wx
        return torch.add(prod, self.bias)     # Wx+b
```
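For illustration (a minimal sketch with hypothetical sizes, not from the notebook), note that a `GaussianLinear` layer expects a 2-D `(batch, features)` input, since `torch.mm` only operates on matrices:

```
# minimal usage sketch with hypothetical sizes:
layer = GaussianLinear(size_in=784, size_out=128, var_W=1.5, var_b=0.05)
x = torch.randn(64, 784)  # torch.mm requires a 2-D (batch, features) matrix
print(layer(x).shape)     # torch.Size([64, 128])
```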

For training, I’m using the built-in SGD optimiser with PyTorch’s version of the cross-entropy as the loss function:

```
# compute gradients & update parameters for a single batch, given loss function & optimizer:
def loss_batch(model, loss_func, xb, yb, opt=None):
    loss = loss_func(model(xb), yb)  # compute specified loss function for the model
    if opt is not None:
        loss.backward()   # compute gradients
        opt.step()        # update parameters
        opt.zero_grad()   # zero gradients in preparation for next iteration
    # n.b., detaching returns the value of the loss; without, returns entire computation graph!
    return loss.detach().item(), len(xb)

# compute accuracy; predicted digit corresponds to index with maximum value:
def accuracy(preds, yb):
    preds = torch.argmax(preds, 1)       # max argument along axis 1 (n.b., batch size must be > 1, else error)
    return (preds == yb).float().mean()  # for each element: 1 if prediction matches target, 0 otherwise

# train & evaluate the model, given loss function, optimizer, and DataLoaders:
def fit(epochs, model, loss_func, opt, train_dl, valid_dl, acc_list=-1, loss_list=-1):
    for epoch in range(epochs):
        model.train()  # ensure training mode (e.g., dropout)
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)
        model.eval()  # ensure evaluation mode (e.g., no dropout)
        with torch.no_grad():
            # compute loss:
            losses, nums = zip(*[loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl])  # * unzips
            val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
            # compute accuracy:
            accuracies = np.array([accuracy(model(xb), yb) for xb, yb in valid_dl])
            val_acc = np.sum(accuracies) / len(accuracies)
        print(epoch, val_loss, val_acc)  # monitor progress
        # save progress only if user passed lists (for speed if not):
        if isinstance(loss_list, list):
            loss_list.append(val_loss)
        if isinstance(acc_list, list):
            acc_list.append(val_acc)
```
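As a small illustration of the `accuracy` helper on a toy batch (hypothetical values):

```
# toy check of accuracy(): argmax along axis 1 picks the predicted class
preds = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
targets = torch.tensor([1, 0, 0])
print(accuracy(preds, targets))  # tensor(0.6667): two of three predictions match
```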

While I’d be surprised if it’s relevant here, seeing as the error arises even with the hooks disengaged, I’ve included them for the purposes of extracting pre-activations for analysis (in a different notebook):
```
# simple class to store layer inputs & outputs:
class Hook():
    def __init__(self, module, input=None, output=None):
        self.hook = module.register_forward_hook(self.hook_fn)
        self.input = input
        #self.output = output

    def hook_fn(self, module, input, output):
        self.input.append(input[0].detach())
        #self.output.append(output.detach())

    def close(self):
        self.hook.remove()

# function that recursively registers hooks on Tanh layers:
def hook_layers(net, inputs, outputs):
    for name, layer in net._modules.items():
        # if nn.Sequential, register recursively on constituent modules:
        if isinstance(layer, nn.Sequential):
            hook_layers(layer, inputs, outputs)
        # individual module, register hook only on Tanh:
        elif isinstance(layer, nn.Tanh):
            Hook(layer, inputs, outputs)
```
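To illustrate the intended behaviour (a toy example, not from the notebook): after registering, each forward pass appends the input of every `Tanh` layer to the shared list:

```
# toy example: capture Tanh inputs from a small Sequential model
net = nn.Sequential(nn.Linear(4, 3), nn.Tanh(), nn.Linear(3, 2), nn.Tanh())
tanh_inputs = []
hook_layers(net, tanh_inputs, [])
net(torch.randn(5, 4))
print(len(tanh_inputs), tanh_inputs[0].shape)  # 2 torch.Size([5, 3])
```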

For convenience, I’m using PyTorch’s `DataLoader` utility to handle batch management:

```
# return DataLoaders for training and validation sets, for batch management:
def get_data(x_train, y_train, x_valid, y_valid, batch_size):
    return (DataLoader(TensorDataset(x_train, y_train), batch_size, shuffle=False),
            DataLoader(TensorDataset(x_valid, y_valid), batch_size*2))
```
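Pulling a single batch (illustrative only) confirms that the loaders yield `(xb, yb)` pairs:

```
# illustrative: grab one (xb, yb) pair from the training DataLoader
train_dl, valid_dl = get_data(x_train, y_train, x_valid, y_valid, 64)
xb, yb = next(iter(train_dl))
print(xb.shape, yb.shape)
```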

Now, to actually construct the networks, I’ve created the following function, which facilitates playing with different depths (the steady width reduction is chosen for research reasons):

```
# Construct a Gaussian neural network consisting of GaussianLinear layers followed by Tanh layers.
# The layer widths are steadily reduced from input_dim to output_dim
# in step sizes of (input_dim - output_dim)/num_layers (n.b., this implies a maximum depth).
def build_network(num_layers, input_dim, output_dim, var_w, var_b):
    # determine how much to shrink each layer:
    diff = input_dim - output_dim
    if num_layers > diff:
        raise Exception('Specified number of layers exceeds maximum value consistent\n'
                        'with monotonic reduction in layer widths. Max allowed depth is {}'.format(diff))
    shrink = math.floor(diff/num_layers)  # n.b., rounding up can over-decimate in deep networks!
    # compute layer widths:
    widths = []
    for i in range(num_layers):
        widths.append(input_dim - shrink*i)
    # output layer:
    widths.append(output_dim)
    # construct and add layers to list (no need to use nn.ModuleList):
    mlist = []
    for i in range(num_layers):
        mlist.append(GaussianLinear(widths[i], widths[i+1], var_w, var_b))
        mlist.append(nn.Tanh())
    return nn.Sequential(*mlist)
```
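For example (a hypothetical depth, just to show the widths it picks), a 5-layer network shrinks from 784 towards 10 in steps of floor(774/5) = 154:

```
# hypothetical example: inspect the widths chosen by build_network
net = build_network(5, 784, 10, var_w=1.5, var_b=0.05)
for m in net:
    if isinstance(m, GaussianLinear):
        print(m.size_in, '->', m.size_out)
# 784 -> 630, 630 -> 476, 476 -> 322, 322 -> 168, 168 -> 10
```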

Finally, I have a function that encapsulates the creation and training of different models, and optionally writes various data to disk in HDF5 format (the error occurs even with all `write` flags set to `False`, so I’ve omitted the write functions here, since they’re not relevant):

```
# construct and train models with a range of depths;
# pass -1 (or any non-str) for file names to avoid writing:
def train_models(depth_min, depth_max, depth_step=1, file_acc='accuracies.hdf5',
                 write_params=False, file_params='parameters.hdf5',
                 hooks=False, file_hook='hooks.hdf5',
                 save_model=False, file_model='model.hdf5'):
    depth = np.arange(depth_min, depth_max, depth_step)
    print('Depth list: ', depth)
    # construct new set of models & associated optimizers:
    model = []
    opt = []
    for i, d in enumerate(depth):
        # create model, append to list:
        model.append(build_network(d, 784, 10, var_weight, var_bias))
        # create optimizer:
        opt.append(optim.SGD(model[i].parameters(), rate, momentum))
    # optionally add hooks to store layer inputs/outputs:
    if hooks:
        inputs, outputs = [], []
        for i in range(len(model)):
            inputs.append([])
            outputs.append([])
            hook_layers(model[i], inputs[i], outputs[i])
    # train models, optionally write data:
    for i in range(len(model)):
        accuracies = []  # store accuracies
        print('\nTraining model ', i, ' with depth ', depth[i], '...')
        fit(epochs, model[i], loss_func, opt[i], train_dl, valid_dl, accuracies)
        if save_model:
            model_name = PATH_TO_DATA + re.sub(r'\.(.*)$', '', file_model) + '-{}.hdf5'.format(depth[i])
            torch.save(model[i].state_dict(), model_name)
        # optionally write accuracies in hdf5 format:
        if isinstance(file_acc, str):
            write_accuracies(file_acc, depth[i], accuracies, var_weight, var_bias)
        # optionally write weights, biases in hdf5 format:
        if write_params and isinstance(file_params, str):
            write_parameters(file_params, model[i], depth[i], var_weight, var_bias)
        # optionally write hooks in hdf5 format:
        if hooks and isinstance(file_hook, str):
            write_hooks(file_hook, depth[i], inputs[i], outputs[i])
        # clear memory (n.b., only touch the hook lists if they exist):
        model[i], opt[i] = -1, -1
        if hooks:
            inputs[i], outputs[i] = -1, -1
        gc.collect()
    print('\nTraining complete.\n')
```

For completeness, here’s how I’m setting hyper-parameters:

```
# set hyperparameters:
rate = 0.005
epochs = 20
momentum = 0.8
batch_size = 64
# load training & validation data into DataLoaders:
train_dl, valid_dl = get_data(x_train, y_train, x_valid, y_valid, batch_size)
# set loss function:
loss_func = F.cross_entropy
```

Now, the idea is that I fix the (distribution of the) weights and biases, and can easily construct and run a list of networks of arbitrary depths, in this case between 10 and 50 (inclusive) in step sizes of 5. As mentioned above, you can simply ignore the various file I/O arguments.

```
# variances for Gaussian parameter initialization:
var_weight = 1.5
var_bias = 0.05
train_models(10, 51, 5, file_acc='acc-150.hdf5',
             write_params=False, file_params='para-150.hdf5',
             hooks=False, file_hook='hooks-150.hdf5',
             save_model=False, file_model='model-150.hdf5')
```

The above cell throws the error; the full output is:

```
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-28-033c337389bd> in <module>()
5 train_models(10,11,1, file_acc='acc-150.hdf5',
6 write_params=False, file_params='para-150.hdf5',
----> 7 hooks=False, file_hook='hooks-150.hdf5', save_model=False, file_model='model-150.hdf5')
6 frames
<ipython-input-22-3ddf6466441f> in train_models(depth_min, depth_max, depth_step, file_acc, write_params, file_params, hooks, file_hook, save_model, file_model)
32 print('\nTraining model ', i, ' with depth ', depth[i], '...')
33
---> 34 fit(epochs, model[i], loss_func, opt[i], train_dl, valid_dl, accuracies)
35
36 if save_model:
<ipython-input-18-c423efcb85e5> in fit(epochs, model, loss_func, opt, train_dl, valid_dl, acc_list, loss_list)
21 model.train() # ensure training mode (e.g., dropout)
22 for xb, yb in train_dl:
---> 23 loss_batch(model, loss_func, xb, yb, opt)
24
25 model.eval() # ensure evaluation mode (e.g., no dropout)
<ipython-input-18-c423efcb85e5> in loss_batch(model, loss_func, xb, yb, opt)
1 # compute gradients & update parameters for a single batch, given loss function & optimizer:
2 def loss_batch(model, loss_func, xb, yb, opt=None):
----> 3 loss = loss_func(model(xb), yb) # compute specified loss function for the model
4
5 if opt is not None:
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/container.py in forward(self, input)
117 def forward(self, input):
118 for module in self:
--> 119 input = module(input)
120 return input
121
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
<ipython-input-16-e26371d4d8d9> in forward(self, x)
14
15 def forward(self, x):
---> 16 prod = torch.mm(x, self.weights.t()) # Wx
17 return torch.add(prod, self.bias) # Wx+b
RuntimeError: self must be a matrix
```

Thanks for your help, and sincere apologies for the length!