Transformer on Sequential MNIST: Index out of range

Dear Community,

I am currently debugging a transformer implementation, built on the base PyTorch transformer, on the sequential MNIST task. In this task the 28 by 28 MNIST image is flattened into a sequence of 784 pixels, and the model receives a single pixel of the image (so the input size = 1) at each time step t, for a total of T=784 steps. The goal is to examine the context length a transformer can handle. The code below can be copy-pasted and run to reproduce the error IndexError: index out of range in self; the entire error message is shown after the code.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets
from torchvision import transforms as T
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Data settings: dataset root directory and mini-batch size.
data_dir = 'data/'
batch_size = 64
nworkers = 2  # defined here but not referenced elsewhere in this listing

# Optimisation settings: epoch count plus Adam's learning rate and weight decay.
epochs = 50
lr = 0.001
weight_decay = 0.0

# Model dimensions handed to TransformerModel.
input_size = 1
embedding_dim = 256
output_size = 10

class TransformerModel(nn.Module):
    """Embed the input, run it through ``nn.Transformer``, then classify.

    The transformer output is averaged over dim 0 and passed to a linear
    layer that produces ``output_size`` scores.
    """

    def __init__(self, input_size, embedding_dim, output_size):
        """Create the embedding table, the transformer, and the output layer.

        Args:
            input_size: Passed to ``nn.Embedding`` as ``num_embeddings``, i.e.
                the number of distinct indices the table can look up.
            embedding_dim: Width of each embedding vector; also used as the
                transformer's ``d_model`` and the linear layer's input width.
            output_size: Number of outputs of the final linear layer.
        """
        super(TransformerModel, self).__init__()
        self.embedding_dim = embedding_dim

        # NOTE(review): the first argument of nn.Embedding is num_embeddings,
        # so valid indices are [0, input_size - 1]; with input_size = 1 only
        # index 0 can be looked up.
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.transformer = nn.Transformer(
            d_model=embedding_dim, nhead=8, num_encoder_layers=6
        )
        self.fc = nn.Linear(embedding_dim, output_size)

    def forward(self, xs):
        """Return the linear layer's scores for ``xs``.

        Args:
            xs: Tensor of indices for the embedding lookup.

        Returns:
            ``self.fc`` applied to the transformer output averaged over dim 0.
        """
        embeddings = self.embedding(xs)
        # NOTE(review): nn.Transformer.forward is documented as taking both
        # src and tgt; only one tensor is passed here - confirm against the
        # PyTorch docs.
        out = self.transformer(embeddings)  # Pass the embeddings through the Transformer
        # NOTE(review): dim 0 is the sequence axis only if the input is laid
        # out sequence-first; the caller's comments describe a batch-first
        # tensor - confirm.
        out = self.fc(out.mean(dim=0))  # Average embeddings over the sequence dimension
        return out


# MNIST training split. Each image goes through ToTensor and is then
# normalized with mean 0.5 and std 0.5.
# NOTE(review): torchvision documents ToTensor as producing floats in
# [0.0, 1.0], so this Normalize would yield values in [-1, 1], including
# negatives - confirm.
train_dataset = datasets.MNIST(root=data_dir,
                               train=True,
                               transform=T.Compose([T.ToTensor(),
                                                   T.Normalize((0.5,), (0.5,))]),
                               download=True)

# Shuffled mini-batches; drop_last=True discards a final batch that has fewer
# than batch_size samples.
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          drop_last=True)

# Model, Adam optimizer, exponential learning-rate decay, and
# cross-entropy loss.
model = TransformerModel(input_size=input_size, embedding_dim=embedding_dim, output_size=output_size).to(device)
print(model)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
loss_f = nn.CrossEntropyLoss()

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    # Per-epoch accumulators: batch losses and the running accuracy counts.
    loss_lst = []
    correct = 0
    total = 0

    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)
        # NOTE(review): view() reinterprets the existing memory layout and
        # does not transpose; if x arrives as (batch, 1, 28, 28), rows of the
        # result may mix pixels from different images - confirm the layout.
        x = x.view(-1, batch_size, input_size)  # Reshape input to (sequence_length, batch_size, input_size)'
        x = x.permute(1, 0, 2)  # Permute dimensions for Transformer: (batch_size, sequence_length, input_size)
        # Cast to int64 so the tensor can be used as embedding indices.
        # NOTE(review): a float-to-long cast discards the fractional part -
        # confirm this is intended for pixel values.
        x = x.to(torch.long)
        out = model(x)
        loss_val = loss_f(out, y)
        loss_lst.append(float(loss_val.item()))
        # Standard update: clear old gradients, backpropagate, take a step.
        optimizer.zero_grad()
        loss_val.backward()
        optimizer.step()

        # Predicted class is the argmax of the softmax over dim 1.
        classprobs = F.softmax(out, dim=1)
        preds = classprobs.argmax(dim=1)
        total += y.size(0)
        correct += (preds == y).sum().item()

    # Epoch summary: mean batch loss and accuracy, both rounded to 4 places.
    loss_val = round(sum(loss_lst) / len(loss_lst), 4)
    accuracy = round(correct / total, 4)
    print(f"Epoch:{epoch + 1}   Train[Loss:{loss_val}  Accuracy:{accuracy}]")
    # Advance the exponential LR schedule once per epoch.
    scheduler.step()

I am debugging on CPU and get the following error:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-3-4e6d43608921> in <cell line: 60>()
     73         x = x.to(torch.long)
     74         print(x.shape)
---> 75         out = model(x)
     76         loss_val = loss_f(out, y)
     77         loss_lst.append(float(loss_val.item()))

4 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-3-4e6d43608921> in forward(self, xs)
     33 
     34     def forward(self, xs):
---> 35         embeddings = self.embedding(xs)
     36         out = self.transformer(embeddings)  # Pass the embeddings through the Transformer
     37         out = self.fc(out.mean(dim=0))  # Average embeddings over the sequence dimension

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/sparse.py in forward(self, input)
    160 
    161     def forward(self, input: Tensor) -> Tensor:
--> 162         return F.embedding(
    163             input, self.weight, self.padding_idx, self.max_norm,
    164             self.norm_type, self.scale_grad_by_freq, self.sparse)

/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2208         # remove once script supports set_grad_enabled
   2209         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2210     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   2211 
   2212 

IndexError: index out of range in self

Your normalized and transformed input will contain negative values, which are not accepted by the nn.Embedding layer, since word indices in [0, num_embeddings-1] are expected.

Thank you, Patrick Black. However, removing the normalization results in another error, as shown below. Unfortunately, sequential MNIST for transformers in PyTorch is not that well documented. Would you have a pointer or an idea of how to resolve all of this? I want to gather some data on how well a transformer can memorise over a given context length by feeding each pixel of the sequence to the model individually.

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-1-05e84794e837> in <cell line: 59>()
     68         x = x.permute(1, 0, 2)  # Permute dimensions for Transformer: (batch_size, sequence_length, input_size)
     69         x = x.to(torch.long)
---> 70         out = model(x)
     71         loss_val = loss_f(out, y)
     72         loss_lst.append(float(loss_val.item()))

4 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2208         # remove once script supports set_grad_enabled
   2209         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2210     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   2211 
   2212 

IndexError: index out of range in self

Without normalizing the input, each pixel will contain values in the range [0, 255], so you might need to adapt the embedding_dim argument and set it to 256.

The embedding_dim is already set to 256; see embedding_dim = 256. I played around with the parameter a bit, but got the same error.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets
from torchvision import transforms as T
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Data settings: dataset root directory and mini-batch size.
data_dir = 'data/'
batch_size = 64
nworkers = 2  # defined here but not referenced elsewhere in this listing

# Optimisation settings: epoch count plus Adam's learning rate and weight decay.
epochs = 50
lr = 0.001
weight_decay = 0.0

# Model dimensions handed to TransformerModel.
input_size = 1
embedding_dim = 256
output_size = 10

class TransformerModel(nn.Module):
    """Embed the input, call ``nn.Transformer``, then classify.

    The transformer output is averaged over dim 0 and passed to a linear
    layer that produces ``output_size`` scores.
    """

    def __init__(self, input_size, embedding_dim, output_size):
        """Create the embedding table, the transformer, and the output layer.

        Args:
            input_size: Passed to ``nn.Embedding`` as ``num_embeddings``, i.e.
                the number of distinct indices the table can look up.
            embedding_dim: Width of each embedding vector; also used as the
                transformer's ``d_model`` and the linear layer's input width.
            output_size: Number of outputs of the final linear layer.
        """
        super(TransformerModel, self).__init__()
        self.embedding_dim = embedding_dim

        # NOTE(review): the first argument of nn.Embedding is num_embeddings,
        # so valid indices are [0, input_size - 1]; with input_size = 1 only
        # index 0 can be looked up.
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.transformer = nn.Transformer(
            d_model=embedding_dim, nhead=8, num_encoder_layers=6
        )
        self.fc = nn.Linear(embedding_dim, output_size)

    def forward(self, xs):
        """Return the linear layer's scores for ``xs``.

        Args:
            xs: Tensor of indices for the embedding lookup.

        Returns:
            ``self.fc`` applied to the transformer output averaged over dim 0.
        """
        embeddings = self.embedding(xs)
        # NOTE(review): the raw input xs is passed here, not `embeddings`,
        # although the trailing comment says embeddings - confirm intent.
        # NOTE(review): nn.Transformer.forward is documented as taking both
        # src and tgt; only one tensor is passed - confirm against the
        # PyTorch docs.
        out = self.transformer(xs)  # Pass the embeddings through the Transformer
        # NOTE(review): dim 0 is the sequence axis only if the input is laid
        # out sequence-first; the caller's comments describe a batch-first
        # tensor - confirm.
        out = self.fc(out.mean(dim=0))  # Average embeddings over the sequence dimension
        return out

# ... (rest of the code)


# MNIST training split. The only transform applied is ToTensor (no Normalize).
# NOTE(review): torchvision documents ToTensor as scaling pixel values to
# floats in [0.0, 1.0]; confirm the value range before the cast to long below.
train_dataset = datasets.MNIST(root=data_dir,
                               train=True,
                               transform=T.Compose([T.ToTensor(),]),
                               download=True)

# Shuffled mini-batches; drop_last=True discards a final batch that has fewer
# than batch_size samples.
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          drop_last=True)

# Model, Adam optimizer, exponential learning-rate decay, and
# cross-entropy loss.
model = TransformerModel(input_size=input_size, embedding_dim=embedding_dim, output_size=output_size).to(device)
#print(model)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
loss_f = nn.CrossEntropyLoss()

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    # Per-epoch accumulators: batch losses and the running accuracy counts.
    loss_lst = []
    correct = 0
    total = 0

    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)
        # NOTE(review): view() reinterprets the existing memory layout and
        # does not transpose; if x arrives as (batch, 1, 28, 28), rows of the
        # result may mix pixels from different images - confirm the layout.
        x = x.view(-1, batch_size, input_size)  # Reshape input to (sequence_length, batch_size, input_size)'
        x = x.permute(1, 0, 2)  # Permute dimensions for Transformer: (batch_size, sequence_length, input_size)
        # Cast to int64 so the tensor can be used as embedding indices.
        # NOTE(review): a float-to-long cast discards the fractional part -
        # confirm this is intended for pixel values.
        x = x.to(torch.long)
        out = model(x)
        loss_val = loss_f(out, y)
        loss_lst.append(float(loss_val.item()))
        # Standard update: clear old gradients, backpropagate, take a step.
        optimizer.zero_grad()
        loss_val.backward()
        optimizer.step()

        # Predicted class is the argmax of the softmax over dim 1.
        classprobs = F.softmax(out, dim=1)
        preds = classprobs.argmax(dim=1)
        total += y.size(0)
        correct += (preds == y).sum().item()

    # Epoch summary: mean batch loss and accuracy, both rounded to 4 places.
    loss_val = round(sum(loss_lst) / len(loss_lst), 4)
    accuracy = round(correct / total, 4)
    print(f"Epoch:{epoch + 1}   Train[Loss:{loss_val}  Accuracy:{accuracy}]")
    # Advance the exponential LR schedule once per epoch.
    scheduler.step()

Yes, sorry. I meant num_embeddings in my previous post, which is set to input_size=1:

self.embedding = nn.Embedding(input_size, embedding_dim)

You mean I should set input_size to 256?

When I do this, I get the following error:


---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-1-2fb1aeebf2f4> in <cell line: 59>()
     65     for batch_idx, (x, y) in enumerate(train_loader):
     66         x, y = x.to(device), y.to(device)
---> 67         x = x.view(-1, batch_size, input_size)  # Reshape input to (sequence_length, batch_size, input_size)'
     68         x = x.permute(1, 0, 2)  # Permute dimensions for Transformer: (batch_size, sequence_length, input_size)
     69         x = x.to(torch.long)

RuntimeError: shape '[-1, 64, 256]' is invalid for input of size 50176

Something doesn't seem right, though. Shouldn't the transformer take the entire flattened sequence of 28*28 pixels as input, so that the input size should be 784?