Model doesn't learn when the dataset, dataloader and model are instantiated in the same function as the training loop

I have attached the code below.

No matter the model size or any hyperparameter change, it doesn’t seem to learn, and I can’t work out why. Any help/guidance would be appreciated.

DOESN’T WORK:::

def training_loop(
    num_classes: int,
    model_selection: list,
    num_epochs: int = 10,
    lr: float = 0.1,
    batch_size: int = 64,
    max_seq_len: int = 128,
):
    
    # Device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Device is", device, end="\n\n")
    
    # Mapping
    amino_acids = ["A","C","D","E","F","G","H","I","K","L",
               "M","N","P","Q","R","S","T","V","W","Y","X"]
    mapping = {aa:i + 1 for i, aa in enumerate(amino_acids)}
    mapping.update({'X': 21, 'U': 21, 'B': 21, 'O': 21, 'Z': 21})
    
    # Creating the datasets
    train_dataset = PfamDataset("train", 100, mapping, max_seq_len)
    validation_dataset = PfamDataset("validation", 100, mapping, max_seq_len)
    
    # Creating the dataloaders
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, 
                                                   num_workers=0, shuffle=True)
    validation_dataloader = torch.utils.data.DataLoader(validation_dataset,
                                                        batch_size=batch_size, 
                                                        num_workers=0)
    
    # Initiating the model
    if model_selection[0] == "fc":
        model = FullyConnected(max_seq_len, model_selection[1], num_classes).to(device)
        
    print("Model Design (TF Format)")
    summary(model, input_size=(1, max_seq_len))
    print("\n\n")
    
    # Defining the loss function and optimizer
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    print("Beginning training...")
    for epoch in range(1, num_epochs + 1):
        
        # ---------- Training ----------
        model.train()

        train_loss = 0.0
        num_correct = 0
        total_examples = 0

        for X, y in train_dataloader:
            X = X.to(device)
            y = y.to(device)

            y_pred = model(X)

            optimizer.zero_grad()
            loss = loss_function(y_pred, y)
            loss.backward()
            optimizer.step()

            train_loss += loss.data.item() 
            num_correct += (y_pred.argmax(axis=1) == y.argmax(axis=1)).sum().item()
            total_examples += X.shape[0]

        train_accuracy = num_correct / total_examples
        train_loss = train_loss / len(train_dataloader.dataset)
    
        # ---------- Evaluating ----------
        model.eval()

        eval_loss = 0.0
        num_correct = 0
        total_examples = 0

        for X, y in validation_dataloader:
            X = X.to(device)
            y = y.to(device)

            y_pred = model(X)

            loss = loss_function(y_pred, y)

            eval_loss += loss.data.item() 
            num_correct += (y_pred.argmax(axis=1) == y.argmax(axis=1)).sum().item()
            total_examples += X.shape[0]

        eval_accuracy = num_correct / total_examples
        eval_loss = eval_loss / len(validation_dataloader.dataset)
        
        if epoch == 1 or epoch % 1 == 0:
            print('Epoch %3d/%3d, train loss: %3.2f, train acc: %3.2f, val loss: %3.2f, val acc: %3.2f' % \
                (epoch, num_epochs, train_loss, train_accuracy, eval_loss, eval_accuracy))


training_loop(model_selection=["fc", 128], num_classes=100, max_seq_len=128, num_epochs=50)

However, if I create the datasets, dataloaders, etc. outside of the function and then run the loop, it works. Why is this?

WORKS:::

num_classes = 100
max_seq_len = 128
batch_size = 64

# Mapping
amino_acids = ["A","C","D","E","F","G","H","I","K","L",
               "M","N","P","Q","R","S","T","V","W","Y","X"]
mapping = {aa:i + 1 for i, aa in enumerate(amino_acids)}
mapping.update({'X': 21, 'U': 21, 'B': 21, 'O': 21, 'Z': 21})

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Creating the datasets
train_dataset = PfamDataset("train", 100, mapping, max_seq_len)
validation_dataset = PfamDataset("validation", 100, mapping, max_seq_len)

# Creating the dataloaders
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                               num_workers=0, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size,
                                             num_workers=0)

model = FullyConnected(max_seq_len, 128, num_classes).to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

def train_evaluate(
    model, 
    optimizer, 
    loss_function, 
    train_dataloader, 
    val_dataloader, 
    epochs=10, 
    device=device
):
    
    history = {} 
    history['loss'] = []
    history['accuracy'] = []
    history['validation_loss'] = []
    history['validation_accuracy'] = []

    for epoch in range(1, epochs+1):
        # --- TRAIN AND EVALUATE ON TRAINING SET 
        model.train()

        train_loss = 0.0
        num_train_correct = 0
        num_train_examples = 0

        for X, y in train_dataloader:
            X = X.to(device)
            y = y.to(device)
            #print("Input shape:     ", X.shape)
            
            y_pred = model(X)

            optimizer.zero_grad()
            loss = loss_function(y_pred, y)
            loss.backward()
            optimizer.step()

            train_loss += loss.data.item() 
            num_train_correct += (y_pred.argmax(axis=1) == y.argmax(axis=1)).sum().item()
            num_train_examples += X.shape[0]

        train_acc = num_train_correct / num_train_examples
        train_loss = train_loss / len(train_dataloader.dataset)


        # --- EVALUATE ON VALIDATION SET -------------------------------------
        model.eval()
        val_loss = 0.0
        num_val_correct = 0
        num_val_examples = 0

        for X, y in val_dataloader:
            X = X.to(device)
            y = y.to(device)
            y_pred = model(X)
            loss = loss_function(y_pred, y)

            val_loss += loss.data.item()
            num_val_correct += (y_pred.argmax(axis=1) == y.argmax(axis=1)).sum().item()
            num_val_examples += X.shape[0]

        val_acc = num_val_correct / num_val_examples
        val_loss = val_loss / len(val_dataloader.dataset)

        if epoch == 1 or epoch % 1 == 0:
          print('Epoch %3d/%3d, train loss: %5.2f, train acc: %5.2f, val loss: %5.2f, val acc: %5.2f' % \
                (epoch, epochs, train_loss, train_acc, val_loss, val_acc))

        history['loss'].append(train_loss)
        history['accuracy'].append(train_acc)
        history['validation_loss'].append(val_loss)
        history['validation_accuracy'].append(val_acc)

    return history
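For completeness, a minimal example of how this version can be launched with the objects created outside the function (the epoch count here is just illustrative):

history = train_evaluate(model, optimizer, loss_function,
                         train_dataloader, val_dataloader,
                         epochs=50, device=device)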

I cannot reproduce the issue using these executable code snippets:

# 1st approach

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

def training_loop(
    num_classes: int,
    model_selection: list,
    num_epochs: int = 10,
    lr: float = 0.1,
    batch_size: int = 64,
    max_seq_len: int = 128,
):
    # Device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Device is", device, end="\n\n")
    
    
    # Creating the datasets
    train_dataset = TensorDataset(
        torch.sin(torch.linspace(0, 6.3, 100)).unsqueeze(1), 
        torch.where(torch.sin(torch.linspace(0, 6.3, 100)) > 0., 1, 0))

    # Creating the dataloaders
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, 
                                                   num_workers=0, shuffle=True)
    
    model = nn.Sequential(
        nn.Linear(1, 16),
        nn.ReLU(),
        nn.Linear(16, 2),
    ).to(device)
    
    # Defining the loss function and optimizer
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    print("Beginning training...")
    for epoch in range(1, num_epochs + 1):
        
        # ---------- Training ----------
        model.train()

        train_loss = 0.0
        num_correct = 0
        total_examples = 0

        for X, y in train_dataloader:
            X = X.to(device)
            y = y.to(device)

            y_pred = model(X)

            optimizer.zero_grad()
            loss = loss_function(y_pred, y)
            loss.backward()
            optimizer.step()

            train_loss += loss.data.item() 
            num_correct += (y_pred.argmax(axis=1) == y).sum().item()
            total_examples += X.shape[0]

        train_accuracy = num_correct / total_examples
        train_loss = train_loss / len(train_dataloader.dataset)
    
        
        if epoch == 1 or epoch % 1 == 0:
            print('Epoch %3d/%3d, train loss: %3.2f, train acc: %3.2f' % \
                (epoch, num_epochs, train_loss, train_accuracy))


training_loop(model_selection=["fc", 128], num_classes=100, max_seq_len=128, num_epochs=3)


# 2nd approach

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"

model = nn.Sequential(
    nn.Linear(1, 16),
    nn.ReLU(),
    nn.Linear(16, 2),
).to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.1)

# Creating the datasets
train_dataset = TensorDataset(
    torch.sin(torch.linspace(0, 6.3, 100)).unsqueeze(1), 
    torch.where(torch.sin(torch.linspace(0, 6.3, 100)) > 0., 1, 0))

# Creating the dataloaders
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, 
                                               num_workers=0, shuffle=True)

def train_evaluate(
    model, 
    optimizer, 
    loss_function, 
    train_dataloader, 
    val_dataloader, 
    epochs=10, 
    device=device
):
    
    history = {} 
    history['loss'] = []
    history['accuracy'] = []
    history['validation_loss'] = []
    history['validation_accuracy'] = []

    for epoch in range(1, epochs+1):
        # --- TRAIN AND EVALUATE ON TRAINING SET 
        model.train()

        train_loss = 0.0
        num_train_correct = 0
        num_train_examples = 0

        for X, y in train_dataloader:
            X = X.to(device)
            y = y.to(device)
            #print("Input shape:     ", X.shape)
            
            y_pred = model(X)

            optimizer.zero_grad()
            loss = loss_function(y_pred, y)
            loss.backward()
            optimizer.step()

            train_loss += loss.data.item() 
            num_train_correct += (y_pred.argmax(axis=1) == y).sum().item()
            num_train_examples += X.shape[0]

        train_acc = num_train_correct / num_train_examples
        train_loss = train_loss / len(train_dataloader.dataset)


        if epoch == 1 or epoch % 1 == 0:
          print('Epoch %3d/%3d, train loss: %5.2f, train acc: %5.2f' % \
                (epoch, epochs, train_loss, train_acc))

        history['loss'].append(train_loss)
        history['accuracy'].append(train_acc)


    return history

history = train_evaluate(model, optimizer, loss_function, train_dataloader, None, epochs=3, device=device)

Note that I needed to change a few things, as your code snippets weren’t executable.
E.g. the accuracy calculation looks wrong, since you are calling argmax on the target, which should not have a class dimension, and you are also using different learning rates (0.1 in the failing version vs. 0.001 in the working one).
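
To illustrate the accuracy point, here is a small sketch assuming the dataset returns integer class indices (the standard target format for nn.CrossEntropyLoss); in that case the target has no class dimension and is compared to the prediction's argmax directly:

import torch
import torch.nn as nn

loss_function = nn.CrossEntropyLoss()

y_pred = torch.randn(4, 100)          # logits with shape [batch_size, num_classes]
y = torch.randint(0, 100, (4,))       # class indices with shape [batch_size]

loss = loss_function(y_pred, y)       # index targets work directly with CrossEntropyLoss

# only the prediction needs argmax; y.argmax(axis=1) would fail on a 1D index target
num_correct = (y_pred.argmax(dim=1) == y).sum().item()
accuracy = num_correct / y.shape[0]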