How can I improve the validation loss in multi-class text classification?

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.functional import numericalize_tokens_from_iterator
import pandas as pd
import json
import re
from transformers import BertTokenizer
from nltk.tokenize import RegexpTokenizer
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

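# Dataset that loads the resume JSON, builds a vocabulary and label map, and serves (token_ids, label) pairs.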
class ResumeDataSet(Dataset):
    def __init__(self, file_path, max_seq_length=512):
        self.file_path = file_path
        self.max_seq_length = max_seq_length

        print("Loading data...")
        self.input_texts, self.labels = self.load_data()

        print("Building vocabulary...")
        self.vocab = self.create_vocabulary()

        print("Creating label mappings...")
        self.label_map = self.create_label_map(self.labels)
        self.labels_encoded = self.create_class_indices(self.labels)

        print("Converting texts to tensors...")
        self.text_tensors = self.convert_texts_to_tensors()

        print("Dataset preparation complete!")
    def load_data(self):
        with open(self.file_path, "r") as file:
            data = json.load(file)

        df = pd.DataFrame(data["data"])
        df = df.fillna("")
        df["skills"] = df["skills"].apply(lambda x: " ".join(x) if isinstance(x, list) else x)
        df = df.drop_duplicates()

        input_texts = []
        for _, row in df.iterrows():
            combined_text = (f"{row.get('summary', '')} "
                             f"{row.get('experience', '')} "
                             f"{row.get('skills', '')} "
                             f"{row.get('education', '')}")
            cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', combined_text.lower())
            cleaned_text = re.sub(r'\s+', ' ', cleaned_text.strip())
            input_texts.append(cleaned_text)
        labels = [i.lower() if len(i) > 0 else "unknown" for i in df["designation"]]

        return input_texts, labels

    def token_gen(self, text):
        # tokenizer = get_tokenizer("basic_english")
        tokenizer = RegexpTokenizer(r'\w+')
        # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        for sent in text:
            tokens = tokenizer.tokenize(sent)
            yield tokens

    def create_vocabulary(self):
        # Reserve explicit <UNK> and <PAD> specials so padding does not alias the unknown token.
        vocab = build_vocab_from_iterator(self.token_gen(self.input_texts),
                                          specials=["<UNK>", "<PAD>"], max_tokens=5000)
        vocab.set_default_index(vocab["<UNK>"])
        return vocab

    def create_label_map(self, labels):
        unique_labels = sorted(set(labels))
        return {label: idx for idx, label in enumerate(unique_labels)}

    def create_class_indices(self, labels):
        return torch.tensor([self.label_map[label] for label in labels])

    def convert_texts_to_tensors(self):
        sequence = numericalize_tokens_from_iterator(vocab=self.vocab, iterator=self.token_gen(self.input_texts))

        token_ids = []
        for i in range(len(self.input_texts)):
            x = list(next(sequence))
            token_ids.append(x)

        padded_text = pad_sequence([torch.tensor(x) for x in token_ids], batch_first=True,
                                   padding_value=self.vocab["<PAD>"])

        if padded_text.size(1) > self.max_seq_length:
            padded_text = padded_text[:, :self.max_seq_length]

        return padded_text

    def __len__(self):
        return len(self.text_tensors)

    def __getitem__(self, idx):
        return self.text_tensors[idx], self.labels_encoded[idx]

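# Classifier: embedding -> 2-layer bidirectional LSTM -> LayerNorm -> two fully connected layers.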
class ResumeModel(nn.Module):
    def __init__(self, vocab_size, num_classes, embed_size=100, hidden_size=128):
        super(ResumeModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            dropout=0.3,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True)  # Bidirectional LSTM
        self.batch_norm = nn.LayerNorm(hidden_size * 2)  # Layer normalization over the LSTM output
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(0.5)  # Increased dropout
        self._init_weights()

    def _init_weights(self):
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)

        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.zeros_(param)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]  # Taking the last hidden state
        # lstm_out = torch.mean(lstm_out, dim=1)  # Averaging the hidden states
        lstm_out = self.batch_norm(lstm_out)  # Applying layer normalization
        output = self.fc1(lstm_out)
        output = self.dropout(output)  # Applying dropout
        output = self.fc2(output)
        return output

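# Checkpoint helpers: the saved file bundles the model weights, vocabulary, label map, and their sizes.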
def save_model(model, vocab, label_map, model_path="resume_model.pth"):
    torch.save({
        'model_state_dict': model.state_dict(),
        'vocab': vocab,
        'label_map': label_map,
        'vocab_size': len(vocab),
        'num_classes': len(label_map),
    }, model_path)
    print(f"Model saved to {model_path}")

def load_model(model_path):
    checkpoint = torch.load(model_path)
    model = ResumeModel(checkpoint['vocab_size'], checkpoint['num_classes'], embed_size=128, hidden_size=100)

model.load_state_dict(checkpoint['model_state_dict'])

vocab = checkpoint['vocab']
label_map = checkpoint['label_map']

return model, vocab, label_map

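# Training loop with gradient clipping, ReduceLROnPlateau scheduling, best-validation-loss checkpointing, and early stopping.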
def train_model(model, train_loader, val_loader, criterion, optimizer,
                save_path="resume_model.pth", num_epochs=50, patience=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    best_val_loss = float("inf")
    epochs_without_improvement = 0
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 3.0, gamma=0.5)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # print("-" * 50)
            # print("Actual_labels:", labels)
            # print("Predicted_labels:", torch.argmax(outputs, dim=1))
            # print("-" * 50)
            loss = criterion(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        epoch_loss = running_loss / len(train_loader)
        epoch_accuracy = 100 * correct / total

        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                # print("While evaluate")
                # print("Actual_labels:", labels)
                # print("Predicted_labels:", torch.argmax(outputs, dim=1))
                # print("-" * 50)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                val_correct += (predicted == labels).sum().item()
                val_total += labels.size(0)

        val_loss = val_loss / len(val_loader)
        val_accuracy = 100 * val_correct / val_total
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_model(model, train_loader.dataset.vocab, train_loader.dataset.label_map, save_path)
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping triggered!")
            break

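# Inference helper: tokenizes a raw resume string and predicts its designation with the saved model.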
def evaluate(text, model_path, max_seq_length=512):
    # tokenizer = get_tokenizer("basic_english")
    tokenizer = RegexpTokenizer(r'\w+')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, vocab, label_map = load_model(model_path=model_path)
    model.to(device)
    print(label_map)

    # Tokenize the text and convert it to indices
    tokens = tokenizer.tokenize(text.lower())
    token_ids = [vocab[token] for token in tokens]

    # Truncate to the maximum sequence length and add a batch dimension
    input_tensor = torch.tensor(token_ids).unsqueeze(0)
    if input_tensor.size(1) > max_seq_length:
        input_tensor = input_tensor[:, :max_seq_length]
    input_tensor = input_tensor.to(device)

    # Model inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_tensor)
        print("Raw output:", outputs)
        _, predicted = torch.max(outputs, 1)
        predicted_label = list(label_map.keys())[list(label_map.values()).index(predicted.item())]

    return predicted_label

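# Entry point: build the datasets, set up class-weighted cross-entropy, and train.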
if __name__ == "__main__":
    # Loading Data
    train_data = ResumeDataSet('resume_data.json')
    train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

    # Validation data builds its own vocabulary and label map from test_data.json
    val_data = ResumeDataSet('test_data.json')
    val_loader = DataLoader(val_data, batch_size=64)

    # Set model parameters
    vocab_size = len(train_data.vocab)
    num_classes = len(train_data.label_map)

    model = ResumeModel(vocab_size=vocab_size, num_classes=num_classes, embed_size=128, hidden_size=100)

    # Define Loss Function and Optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # model.to(device)
    class_weights = compute_class_weight('balanced', classes=np.unique(train_data.labels_encoded.numpy()),
                                         y=train_data.labels_encoded.numpy())
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    # Train Model
    train_model(model, train_loader, val_loader, criterion, optimizer,
                save_path="rnn_resume_model.pt", num_epochs=500, patience=50)
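
    # Example call to the inference helper (hypothetical resume text; assumes the checkpoint above was written):
    # print(evaluate("python developer with 5 years of django experience", "rnn_resume_model.pt"))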

In the code above I am building a resume classification model, but the validation loss does not improve. Can anyone help me with that?