Dear Community,

My vanilla RNN has a lower test loss and higher test accuracy than the respective training metrics, as shown in the printout below:

```
Simple RNN initalised with 1 layers and 6 number of hidden neurons.
Epoch:1 Train[Loss:2.29849 Top1 Acc:0.15428]
Epoch:1 Test[Loss:2.29271 Top1 Acc:0.243]
Epoch:2 Train[Loss:2.28381 Top1 Acc:0.27862]
Epoch:2 Test[Loss:2.27542 Top1 Acc:0.3129]
...
Epoch:7 Train[Loss:2.15966 Top1 Acc:0.47402]
Epoch:7 Test[Loss:2.14895 Top1 Acc:0.4761]
Epoch:8 Train[Loss:2.13435 Top1 Acc:0.4836]
Epoch:8 Test[Loss:2.1245 Top1 Acc:0.4859]
```

I have tried investigating this, by:

- Removing any regularisation (i.e., set weight decay to zero; no dropout or similar was used initially)
- Checking the training loop and how accuracy is computed within it. Here I corrected a small bias introduced by making accuracy dependent on the entire dataset rather than the batch size.
- Removed initialisation of weights, and replaced warmup with cosine annealing by a constant learning rate of 0.00001.

However, despite this, test loss and accuracy still outperform training loss and accuracy by a small amount, regardless of which configuration I use. I have added my RnnCell, my SimpleRNN and, maybe more importantly, my training loop below. Any thoughts or ideas on what could cause this behavior would be appreciated. Could these small differences be negligible?

The code will of course also run by copy-pasting it into a Google Colab cell or notebook cell. Please let me know.

**Imports**

```
import numpy as np
import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as T
from torch import nn, optim
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
```

**RnnCell**

```
class RnnCell(nn.Module):
    """Single Elman-style RNN cell: h' = act(W_ih x + b_ih + W_hh h + b_hh)."""

    def __init__(self, input_size, hidden_size, activation="tanh"):
        """
        Args:
            input_size: number of input features per step.
            hidden_size: number of hidden units.
            activation: one of "tanh", "relu", "sigmoid".

        Raises:
            ValueError: if an unsupported activation name is given.
        """
        super(RnnCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.activation = activation
        if self.activation not in ("tanh", "relu", "sigmoid"):
            raise ValueError("Invalid nonlinearity selected for RNN. Please use tanh, relu or sigmoid.")
        self.input2hidden = nn.Linear(input_size, hidden_size)
        # hidden2hidden when we have more than 1 RNN stacked
        # hidden2out when we have only 1 RNN
        self.hidden2hidden = nn.Linear(hidden_size, hidden_size)
        self.init_weights_normal()

    def forward(self, input, hidden_state=None):
        '''
        Inputs: input (torch tensor) of shape [batchsize, input_size]
                hidden_state (torch tensor) of shape [batchsize, hidden_size]
        Output: output (torch tensor) of shape [batchsize, hidden_size]
        '''
        # Lazily initialise the hidden state on the *input's* device/dtype
        # instead of relying on a module-level `device` global (which made the
        # cell fail or silently mismatch devices when used standalone).
        if hidden_state is None:
            hidden_state = torch.zeros(input.shape[0], self.hidden_size,
                                       device=input.device, dtype=input.dtype)
        pre_activation = self.input2hidden(input) + self.hidden2hidden(hidden_state)
        # apply the configured nonlinearity to the pre-activation
        if self.activation == "tanh":
            return torch.tanh(pre_activation)
        if self.activation == "relu":
            return torch.relu(pre_activation)
        return torch.sigmoid(pre_activation)

    def init_weights_normal(self):
        # Initialise every parameter from N(0, 0.02). Mutate under no_grad
        # instead of the deprecated `.data` access.
        with torch.no_grad():
            for weight in self.parameters():
                weight.normal_(0, 0.02)

**Simple RNN**

```
class SimpleRNN(nn.Module):
    """Stacked RNN built from RnnCell; returns logits for the last time step."""

    def __init__(self, input_size, hidden_size, num_layers, output_size, activation='relu'):
        """
        Args:
            input_size: features per time step.
            hidden_size: hidden units per layer.
            num_layers: number of stacked RnnCells.
            output_size: number of output classes/logits.
            activation: one of "tanh", "relu", "sigmoid".

        Raises:
            ValueError: if an unsupported activation name is given.
        """
        super(SimpleRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        if activation not in ('tanh', 'relu', 'sigmoid'):
            raise ValueError("Invalid activation. Please use tanh, relu or sigmoid activation.")
        # One construction path for all activations (previously three
        # copy-pasted branches): the first cell maps input_size -> hidden_size,
        # every subsequent cell maps hidden_size -> hidden_size.
        self.rnn_cell_list = nn.ModuleList()
        self.rnn_cell_list.append(RnnCell(self.input_size, self.hidden_size, activation))
        for _ in range(1, self.num_layers):
            self.rnn_cell_list.append(RnnCell(self.hidden_size, self.hidden_size, activation))
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden_state=None):
        '''
        Inputs: input (torch tensor) of shape [batchsize, sequence length, input_size]
                hidden_state (torch tensor) of shape [num_layers, batchsize, hidden_size]
        Output: output (torch tensor) of shape [batchsize, output_size]
        '''
        # initialise hidden state on the input's device when none is given
        if hidden_state is None:
            hidden_state = torch.zeros(self.num_layers, input.size(0), self.hidden_size,
                                       device=input.device, dtype=input.dtype)
        # per-layer running hidden states
        hidden = [hidden_state[layer] for layer in range(self.num_layers)]
        last_top = None
        for t in range(input.size(1)):
            for layer in range(self.num_layers):
                # layer 0 consumes the input at step t; deeper layers consume
                # the (already updated) output of the layer below
                step_in = input[:, t, :] if layer == 0 else hidden[layer - 1]
                hidden[layer] = self.rnn_cell_list[layer](step_in, hidden[layer])
            last_top = hidden[-1]
        # BUGFIX: the original did `outs[-1].squeeze()`, which also removed the
        # batch dimension whenever batch size == 1 (and the hidden dimension if
        # hidden_size == 1). Use the last top-layer state directly.
        out = self.fc(last_top)
        return out

**Training**

```
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
weight_decay = 0
sequence_length = 28*28
input_size = 28
hidden_size = 6
nlayers = 1
nclasses = 10
batch_size = 64
nepochs = 50
T_max = nepochs - 5
lr = 0.00001
save_model = True
continue_training = False
data_dir = 'data/'
def train(train_loader, model, optimizer, loss_f):
    '''
    Run one training epoch.

    Input: train_loader (torch DataLoader) yielding (x, y) with x of shape
           [batchsize, 784] (flattened images) and integer class labels y,
           model (torch model), optimizer (torch optimizer),
           loss_f (loss function, e.g. nn.CrossEntropyLoss).
    Output: (mean per-batch loss (float), top-1 accuracy over the dataset (float)).
    '''
    model.train()
    # infer the device from the model instead of relying on a global variable
    device = next(model.parameters()).device
    loss_lst = []
    correct = 0
    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)
        # turn [batch, 784] into [batch, 784, 784]: each "time step" sees the image
        x_expanded = x[:, None, ...].expand(x.shape[0], x.shape[1], x.shape[1])
        out = model(x_expanded)
        # BUGFIX: do NOT apply F.softmax before the loss. nn.CrossEntropyLoss
        # expects raw logits and applies log-softmax internally; feeding it
        # already-softmaxed outputs double-squashes the logits and distorts
        # both the loss scale and the gradients.
        pred = torch.argmax(out, dim=1)  # argmax is identical on logits and softmax
        correct += (pred == y).sum().item()
        loss_val = loss_f(out, y)
        loss_lst.append(float(loss_val.item()))
        optimizer.zero_grad()
        loss_val.backward()
        optimizer.step()
    # average the per-batch losses to obtain a single value for the epoch
    # (inlined; the original called an undefined helper `lst_avg`)
    epoch_loss = sum(loss_lst) / max(len(loss_lst), 1)
    train_acc = round(correct / len(train_loader.dataset), 5)
    return (epoch_loss, train_acc)
def test (test_loader, model, loss_f):
'''
Input: test loader (torch loader), model (torch model), loss function
(torch custom yolov1 loss).
Output: test loss (torch float).
'''
test_loss_lst = []
model.eval()
correct = 0
with torch.no_grad():
for batch_idx, (x, y) in enumerate(test_loader):
x, y = x.to(device), y.to(device)
x_expanded = x[:, None, ...].expand(x.shape[0], x.shape[1], x.shape[1]).to(device)
out = model(x_expanded)
del x
batchsize = x_expanded.shape[0]
del x_expanded
out = F.softmax(out, dim = 1)
pred = torch.argmax(out, dim = 1)
correct += sum(pred == y)
test_loss_val = loss_f(out, y)
test_loss_lst.append(float(test_loss_val.item()))
test_loss_val = lst_avg(test_loss_lst)
test_acc = round(correct.item() / len(test_loader.dataset), 5)
return (test_loss_val, test_acc)
def main():
    '''Build the SimpleRNN, load MNIST, and run the train/test loop.'''
    print(f'Simple RNN initalised with {nlayers} layers and {hidden_size} number of hidden neurons.')
    # output_size uses the nclasses constant (was a hard-coded 10)
    model = SimpleRNN(input_size=input_size * input_size, hidden_size=hidden_size,
                      num_layers=nlayers, output_size=nclasses, activation='relu').to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    # T_max now uses the module-level constant (was a hard-coded 145).
    # NOTE(review): the scheduler is created but never stepped, which matches
    # the stated intent of a constant learning rate; call scheduler.step()
    # once per epoch to actually enable cosine annealing.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_max, eta_min=0)
    loss_f = nn.CrossEntropyLoss()
    train_loss_lst = []
    test_loss_lst = []
    train_top1acc_lst = []
    test_top1acc_lst = []
    last_epoch = 0
    train_dataset = torchvision.datasets.MNIST(root=data_dir,
                                               train=True,
                                               transform=T.Compose([T.ToTensor(), T.Lambda(torch.flatten)]),
                                               download=True)
    test_dataset = torchvision.datasets.MNIST(root=data_dir,
                                              train=False,
                                              transform=T.Compose([T.ToTensor(), T.Lambda(torch.flatten)]))
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
    for epoch in range(nepochs - last_epoch):
        train_loss_value, train_top1acc_value = train(train_loader, model, optimizer, loss_f)
        train_loss_lst.append(train_loss_value)
        train_top1acc_lst.append(train_top1acc_value)
        test_loss_value, test_top1acc_value = test(test_loader, model, loss_f)
        test_loss_lst.append(test_loss_value)
        test_top1acc_lst.append(test_top1acc_value)
        # BUGFIX: removed a stray leftover fragment here that referenced an
        # undefined `test_top5acc_value` and made the file a SyntaxError.
        print(f"Epoch:{epoch + last_epoch + 1} Train[Loss:{train_loss_value} Top1 Acc:{train_top1acc_value}]")
        print(f"Epoch:{epoch + last_epoch + 1} Test[Loss:{test_loss_value} Top1 Acc:{test_top1acc_value}]")


if __name__ == "__main__":
    main()
```

All the best,

weight_theta