import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.functional import numericalize_tokens_from_iterator
import pandas as pd
import json
import re
from transformers import BertTokenizer
from nltk.tokenize import RegexpTokenizer
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
class ResumeDataSet(Dataset):
    def __init__(self, file_path, max_seq_length=512):
        self.file_path = file_path
        self.max_seq_length = max_seq_length
        print("Loading data...")
        self.input_texts, self.labels = self.load_data()
        print("Building vocabulary...")
        self.vocab = self.create_vocabulary()
        print("Creating label mappings...")
        self.label_map = self.create_label_map(self.labels)
        self.labels_encoded = self.create_class_indices(self.labels)
        print("Converting texts to tensors...")
        self.text_tensors = self.convert_texts_to_tensors()
        print("Dataset preparation complete!")
    def load_data(self):
        with open(self.file_path, "r") as file:
            data = json.load(file)
        df = pd.DataFrame(data["data"])
        df = df.fillna("")
        df["skills"] = df["skills"].apply(lambda x: " ".join(x) if isinstance(x, list) else x)
        df = df.drop_duplicates()
        input_texts = []
        for _, row in df.iterrows():
            combined_text = (f"{row.get('summary', '')} "
                             f"{row.get('experience', '')} "
                             f"{row.get('skills', '')} "
                             f"{row.get('education', '')}")
            cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', combined_text.lower())
            cleaned_text = re.sub(r'\s+', ' ', cleaned_text.strip())
            input_texts.append(cleaned_text)
        labels = [i.lower() if len(i) > 0 else "unknown" for i in df["designation"]]
        return input_texts, labels
    def token_gen(self, text):
        # tokenizer = get_tokenizer("basic_english")
        tokenizer = RegexpTokenizer(r'\w+')
        # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        for sent in text:
            tokens = tokenizer.tokenize(sent)
            yield tokens
    def create_vocabulary(self):
        # "<PAD>" is added as a special token so that padding gets its own index
        # instead of silently falling back to the default "<UNK>" index.
        vocab = build_vocab_from_iterator(self.token_gen(self.input_texts),
                                          specials=["<PAD>", "<UNK>"], max_tokens=5000)
        vocab.set_default_index(vocab["<UNK>"])
        return vocab
    def create_label_map(self, labels):
        unique_labels = sorted(set(labels))
        return {label: idx for idx, label in enumerate(unique_labels)}
    def create_class_indices(self, labels):
        return torch.tensor([self.label_map[label] for label in labels])
    def convert_texts_to_tensors(self):
        sequence = numericalize_tokens_from_iterator(vocab=self.vocab, iterator=self.token_gen(self.input_texts))
        token_ids = []
        for i in range(len(self.input_texts)):
            x = list(next(sequence))
            token_ids.append(x)
        padded_text = pad_sequence([torch.tensor(x) for x in token_ids], batch_first=True,
                                   padding_value=self.vocab["<PAD>"])
        if padded_text.size(1) > self.max_seq_length:
            padded_text = padded_text[:, :self.max_seq_length]
        return padded_text
    def __len__(self):
        return len(self.text_tensors)
    def __getitem__(self, idx):
        return self.text_tensors[idx], self.labels_encoded[idx]
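# Optional sanity check (a sketch, not part of the original pipeline): a small helper
# that confirms a sample has the expected shape, a sensible number of non-pad tokens,
# and a label index that maps back to a real designation. The helper name is made up.
def inspect_sample(dataset, idx=0):
    text_tensor, label_idx = dataset[idx]
    inverse_label_map = {v: k for k, v in dataset.label_map.items()}
    non_pad = (text_tensor != dataset.vocab["<PAD>"]).sum().item()
    print(f"Sample {idx}: shape={tuple(text_tensor.shape)}, "
          f"non-pad tokens={non_pad}, label={inverse_label_map[label_idx.item()]}")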
class ResumeModel(nn.Module):
    def __init__(self, vocab_size, num_classes, embed_size=100, hidden_size=128):
        super(ResumeModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            dropout=0.3,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True)  # Bidirectional LSTM
        self.batch_norm = nn.LayerNorm(hidden_size * 2)  # Layer normalization over the LSTM features
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(0.5)  # Increased dropout
        self._init_weights()
    def _init_weights(self):
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)
        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.zeros_(param)
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]  # Take the output at the last time step
        # lstm_out = torch.mean(lstm_out, dim=1)  # Alternative: average the hidden states
        lstm_out = self.batch_norm(lstm_out)  # Apply layer normalization
        output = self.fc1(lstm_out)
        output = self.dropout(output)  # Apply dropout
        output = self.fc2(output)
        return output
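# A possible variant of the commented-out averaging above (illustrative only, not the
# original author's method): a padding-aware mean over the LSTM outputs, assuming the
# token ids and the <PAD> index from the dataset's vocabulary are passed in.
def masked_mean_pool(lstm_out, token_ids, pad_idx):
    # mask is 1.0 at real-token positions and 0.0 at padding positions
    mask = (token_ids != pad_idx).unsqueeze(-1).float()
    summed = (lstm_out * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1.0)
    return summed / counts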
def save_model(model, vocab, label_map, model_path="resume_model.pth"):
    torch.save({
        'model_state_dict': model.state_dict(),
        'vocab': vocab,
        'label_map': label_map,
        'vocab_size': len(vocab),
        'num_classes': len(label_map),
        # stored so load_model() can rebuild the same architecture
        'embed_size': model.embedding.embedding_dim,
        'hidden_size': model.lstm.hidden_size,
    }, model_path)
    print(f"Model saved to {model_path}")
def load_model(model_path):
    checkpoint = torch.load(model_path, map_location="cpu")
    # fall back to the sizes used in the main block below if not stored in the checkpoint
    model = ResumeModel(checkpoint['vocab_size'], checkpoint['num_classes'],
                        embed_size=checkpoint.get('embed_size', 128),
                        hidden_size=checkpoint.get('hidden_size', 100))
    model.load_state_dict(checkpoint['model_state_dict'])
    vocab = checkpoint['vocab']
    label_map = checkpoint['label_map']
    return model, vocab, label_map
def train_model(model, train_loader, val_loader, criterion, optimizer,
                save_path="resume_model.pth", num_epochs=50, patience=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    best_val_loss = float("inf")
    epochs_without_improvement = 0
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 3.0, gamma=0.5)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # print("-" * 50)
            # print("Actual_labels:", labels)
            # print("Predicted_labels:", torch.argmax(outputs, dim=1))
            # print("-" * 50)
            loss = criterion(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
        epoch_loss = running_loss / len(train_loader)
        epoch_accuracy = 100 * correct / total
        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                # print("While evaluating")
                # print("Actual_labels:", labels)
                # print("Predicted_labels:", torch.argmax(outputs, dim=1))
                # print("-" * 50)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                val_correct += (predicted == labels).sum().item()
                val_total += labels.size(0)
        val_loss = val_loss / len(val_loader)
        val_accuracy = 100 * val_correct / val_total
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
        scheduler.step(val_loss)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_model(model, train_loader.dataset.vocab, train_loader.dataset.label_map, save_path)
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print("Early stopping triggered!")
                break
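# Optional diagnostic (a sketch, not part of the original training code): print how many
# examples each class has in a dataset, which can be inspected alongside the class
# weights computed in the main block below. The helper name is made up for illustration.
def print_label_distribution(dataset):
    inverse_label_map = {v: k for k, v in dataset.label_map.items()}
    counts = torch.bincount(dataset.labels_encoded, minlength=len(dataset.label_map))
    for idx, count in enumerate(counts.tolist()):
        print(f"{inverse_label_map[idx]}: {count}")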
def evaluate(text, model_path, max_seq_length=512):
    # tokenizer = get_tokenizer("basic_english")
    tokenizer = RegexpTokenizer(r'\w+')
    model, vocab, label_map = load_model(model_path=model_path)
    print(label_map)
    # Tokenize the text and convert it to indices
    tokens = tokenizer.tokenize(text.lower())
    token_ids = [vocab[token] for token in tokens]
    # Truncate the sequence if it is longer than max_seq_length
    input_tensor = torch.tensor(token_ids).unsqueeze(0)
    if input_tensor.size(1) > max_seq_length:
        input_tensor = input_tensor[:, :max_seq_length]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    input_tensor = input_tensor.to(device)
    # Model inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_tensor)
    print("Raw output:", outputs)
    _, predicted = torch.max(outputs, 1)
    inverse_label_map = {v: k for k, v in label_map.items()}
    predicted_label = inverse_label_map[predicted.item()]
    return predicted_label
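# Hypothetical usage sketch (the checkpoint path and resume text are made up for
# illustration; call this only after training has produced a checkpoint):
def example_inference(model_path="rnn_resume_model.pt"):
    sample_text = "experienced python developer with a machine learning background"
    print(evaluate(sample_text, model_path))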
if __name__ == "__main__":
    # Loading Data
    train_data = ResumeDataSet('resume_data.json')
    train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
    # Note: val_data builds its own vocabulary and label map from test_data.json
    val_data = ResumeDataSet('test_data.json')
    val_loader = DataLoader(val_data, batch_size=64)
    # Set model parameters
    vocab_size = len(train_data.vocab)
    num_classes = len(train_data.label_map)
    model = ResumeModel(vocab_size=vocab_size, num_classes=num_classes, embed_size=128, hidden_size=100)
    # Define Loss Function and Optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # model.to(device)  # handled inside train_model
    class_weights = compute_class_weight('balanced', classes=np.unique(train_data.labels_encoded.numpy()),
                                         y=train_data.labels_encoded.numpy())
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    # Train Model
    train_model(model, train_loader, val_loader, criterion, optimizer,
                save_path="rnn_resume_model.pt", num_epochs=500, patience=50)
In the code above I am building a resume classification model, but my validation loss does not improve. Can anyone help me with that?