Training loop crashing due to loss.backwards function

I have been trying to train a lstm model with lyrics data I found on Kaggle. When I first started with this architecture, I had glove embeddings which worked fine with the model and how I shaped it. I switched totokenization and then z scoring my input parameters and all of a sudden my program hangs and then closes out without any error, I only found out about the error when I checked my Event viewer and found this:

Faulting application name: python.exe, version: 3.12.2150.1013, time stamp: 0x65c2a4c7
Faulting module name: c10.dll, version:, time stamp: 0x65a00f1b
Exception code: 0xc0000005
Fault offset: 0x0000000000060bb5
Faulting process id: 0x0x35EC
Faulting application start time: 0x0x1DA5E8859C9145E
Faulting application path: C:\Users\rohun\AppData\Local\Programs\Python\Python312\python.exe
Faulting module path: C:\Users\rohun\final_train\env\Lib\site-packages\torch\lib\c10.dll
Report Id: 8d624ff4-59a7-448f-b320-6e7aeed0ecd4
Faulting package full name:
Faulting package-relative application ID:

I’ve tried a lot of different thigns to fix like changing model architecture, data preprocessing, hardware limitations, and such, but I haven’t found a single thing that could be casuing me this problem. ANy help much appreciated.

For reference this is my code:
import pandas as pd
import string
import torch
import torchtext
import random
import tracemalloc
from import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from operator import itemgetter
from memory_profiler import profile

if torch.cuda.is_available():
device = torch.device(“cuda”)
print(“CUDA is available.”)
device = torch.device(“cpu”)
print(“CUDA is not available. Running on CPU.”)

class EarlyStopping:
def init(self, patience=5, min_delta=0.0):
self.patience = patience
self.min_delta = min_delta
self.counter = 0
self.best_loss = float(‘inf’)

def __call__(self, val_loss):
    if val_loss < self.best_loss - self.min_delta:
        self.best_loss = val_loss
        self.counter = 0
        self.counter += 1
        if self.counter >= self.patience:
            return True  # Stop training
    return False
def reset(self):
    self.counter = 0
    self.best_loss = float('inf')

Grid Search

batch_sizes = [32] # try with 64 and 128
hidden_lstm_sizes = [25] # if it does better wiht the 512 lets use 1024 and a higher value as well, should give us 4 concrete valuesto work with.
hidden_linear_sizes = [124] # 512
epochs = 8 # max number of epochs before overfitting
drop_percentages = [0] # better values with increased dropout percentage, 0.25 working well so lets keep it
learning_rates = [.01]
weight_decays = [0] # 0, 0.10, 0.25

class LSTM(nn.Module):

def __init__(self, embedding_dim, hidden_dim, hidden1dim, output_dim, dropout_dim,sequence_len):
    super(LSTM, self).__init__()
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=False)
    self.hidden = nn.Linear(hidden_dim, hidden1dim)
    self.output = nn.Linear(hidden_dim, output_dim)
    self.drop = nn.Dropout(dropout_dim)

def forward(self, data):
    lstm_out, _ = self.lstm(data)
    #hidden_out = self.hidden(lstm_out)
    #drop_out = self.drop(hidden_out)
    logits = self.output(lstm_out)
    return logits

class data_set(Dataset):
def init(self,X_data,Y_data):
self.X_data = X_data
self.Y_data = Y_data

def __len__(self): 
    return len(self.X_data) 

def __getitem__(self, index):
    return self.X_data[index], self.Y_data[index]

early_stopper = EarlyStopping(patience=3, min_delta=0.0025)
loss_fn = torch.nn.CrossEntropyLoss()

def loop(training_data_loader, validation_data_loader, testing_data_loader,optimizer,epochs,model):
epoch_losses = {}
earlyStopping = False


#c_s = len(training_data_loader) // 10
for epoch in range(epochs):
    if earlyStopping:
    epoch_losses[epoch] = []
    print("Epoch: " + str(epoch))
    total_loss = 0
    running_loss = 0
    for i,batch in enumerate(training_data_loader):
        x_i, y_i = batch[0],batch[1]
        logits = model(x_i)
        loss = loss_fn(logits, y_i)


    total_loss = 0
    running_loss = 0

    for i,batch in enumerate(validation_data_loader):
        x_i, y_i = batch[0],batch[1]

        # forward pass
        with torch.no_grad():
            logits = model(x_i)
        loss = loss_fn(logits, y_i)
        total_loss += loss
        running_loss += loss

        # check early stopping here?

    print("Total Validation Average Loss: " + str((total_loss/len(validation_data_loader)).item()))
    if early_stopper(total_loss/len(validation_data_loader)):
        print(f"Early stopping at epoch {epoch}")
        earlyStopping = True

    total_loss = 0
    running_loss = 0

    for i,batch in enumerate(testing_data_loader):
        x_i, y_i = batch[0],batch[1]

        # forward pass
        with torch.no_grad():
            logits = model(x_i)
        loss = loss_fn(logits, y_i)
        total_loss += loss
        running_loss += loss

        # check early stopping here?

    print("Total Testing Average Loss: " + str((total_loss/len(testing_data_loader)).item()))

return epoch_losses

read data, pre process data

df = pd.read_csv(‘spotify_millsongdata.csv’)
#df = df.loc[(df[‘artist’] == ‘ABBA’) | (df[‘artist’] == ‘Donna Summer’) | (df[‘artist’] == ‘Bob Dylan’)]
df = df.sample(n=1)
df = df[‘text’]

print("Size of dataset: " + str(len(df)))

table = str.maketrans(‘’, ‘’, string.punctuation)

words_set = set()
number_to_words_dictionary = {}
words_to_numbers_dictionary = {}

def preProcess(x):
x = x.strip()
x = x.lower()
x = x.splitlines()

for i in range(len(x)):
    x[i] = x[i].lower()
    x[i] = x[i].translate(table)
    x[i] = x[i].split()

return x

print(“Preprocessing staring…”)
data =
for index, row in df.items():

words_set = list(sorted(words_set))
for i,word in enumerate(words_set):
number_to_words_dictionary[i] = word
words_to_numbers_dictionary[word] = i

max_len = max(len(x) for x in data) - 1

Load GloVe embeddings

#glove = torchtext.vocab.GloVe(name=“6B”, dim=50)

#convert data into X and y data, with X being embeddings and y as numbers
X =
Y =

convert into word embeddings

torch size to be (max_size, 50)

window_size = 3

print(“token embeddings starting …”)

for i in range(len(data)):
update = torch.tensor()
for j in range(len(data[i]) - window_size):
keys = data[i][:j+window_size]

print(“token embeddings completed…”)

Pad the sequences

print(“padding sequences starting…”)
padded_X = pad_sequence(X, batch_first=True).float()

Z score

mean = padded_X.mean(dim=None)
std = padded_X.std(dim=None)

Z-score normalization

padded_X = (padded_X - mean) / std


Y = torch.tensor(Y).long()

print(“padding sequences ending…”)

split training data into training, validation and testing

creates dataset

dataset = data_set(padded_X,Y)

train_size = int(0.7 * len(dataset))
test_size = int(0.2 * len(dataset))
val_size = len(dataset) - train_size - test_size

Split the dataset

train_dataset, test_dataset, val_dataset = random_split(dataset, [train_size, test_size, val_size])

print("Preprocessing Completed … ")


classes_size = len(number_to_words_dictionary)

#accuracy = Accuracy(task=“multiclass”, num_classes=classes_size)

starts the whole loop

sequence_length = padded_X.size()[1]
input_dim = 1

with open(“readme.txt”, “w”) as f:
for batch in batch_sizes:

# Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch, shuffle=True)

    for hidden_lstm in hidden_lstm_sizes:
        for hidden_linear in hidden_linear_sizes:
            for drop_p in drop_percentages:
                # creates model
                model = LSTM(input_dim, hidden_lstm, hidden_linear, classes_size, drop_p,sequence_len=sequence_length)


                for lr in learning_rates:
                    for wd in weight_decays:
                # runs training loop
                        optimizer = torch.optim.Adam(model.parameters(), lr=lr,weight_decay=wd)
                        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)
                        name = f"batch: {batch}, hidden_lstm: {hidden_lstm}, hidden_linear: {hidden_linear}, drop_percentage: {drop_p}, learning_rate: {lr}, weight_decay: {wd}"
                        val= loop(train_loader,val_loader,test_loader,optimizer,epochs=epochs,model=model)
                        f.write(name + "\n")
                        for key,value in val.items():
                            f.write(f"| At epoch: {key} training_loss = {value[0]} validation_loss = {value[1]} testing_loss = {value[2]}" + "\n")
                            print(f"|   At epoch: {key}     |   training_loss = {value[0]}        |   validation_loss = {value[1]}        |   testing_loss = {value[2]}        |")
                        val = None
                        del val
                        del optimizer
                        del scheduler
                del model
    train_loader = None
    val_loader= None
    test_loader = None
    del train_loader
    del val_loader
    del test_loader