Okay, so I am facing a problem. I am trying to build a recurrent LSTM model that predicts the next words, like the model shown in the photo.
For that I am using a dataset with 60,000 sentences.
Here is what I have written so far:
```
import random
import re

import numpy as np
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


class CFG:
    debug = False
    epochs = 500
    learning_rate = 1.0e-3
    batch_size = 64
    target_cols = ['label']  # target columns
    seed = 42
    num_layers = 20  # Number of LSTM layers (recurrences)
    # Model parameters
    vocab_size = None
    vocabFound_size = None
    embedding_dim = 300
    hidden_dim = 128
    dropout = 0.25
    lstm_dropout_rate = 0.2
    num_lstm_layers = 1
    num_class = None
    device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Use GPU if available


def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(CFG.seed)


def preprocess_text(text):
    # Compile regex patterns
    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
    NUMBERS = re.compile(r'\d+')
    STOPWORDS = set(stopwords.words('english')) - {'i'}
    lemmatizer = WordNetLemmatizer()
    # Clean the text
    text = text.lower()                        # Lowercase
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # Replace specific punctuation with spaces
    text = SYMBOLS_RE.sub('', text)            # Remove unnecessary symbols
    text = NUMBERS.sub('', text)               # Remove numbers
    # Tokenize, remove stopwords, and lemmatize
    tokens = [word for word in text.split() if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens
```
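For reference, the cleaning step can be sanity-checked like this (the expected output in the comment is only a rough guess, and it assumes the NLTK stopwords and wordnet data have been downloaded):

```
# Quick sanity check of preprocess_text (requires nltk.download('stopwords') and nltk.download('wordnet'))
print(preprocess_text("The 3 cats were running quickly, I think!"))
# Rough guess at the output: ['cat', 'running', 'quickly', 'i', 'think']
```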
Then I define the preprocessing function for the sentences:
```
def preprocess_sentences(sentences, max_len=20):
    processed_sentences = []
    for sentence in sentences:
        tokens = preprocess_text(sentence)  # Clean and tokenize the text
        if len(tokens) > max_len:
            tokens = tokens[:max_len]  # Trim to max_len
        else:
            tokens += ["EOL"] * (max_len - len(tokens))  # Pad with EOL
        processed_sentences.append(tokens)
    return processed_sentences


train_processed = preprocess_sentences(train['text'].tolist())
valid_processed = preprocess_sentences(valid['text'].tolist())
test_processed = preprocess_sentences(test['text'].tolist())
```
```
def make_vocabulary_from_tokens(tokenized_sentences, min_doc_freq=3, max_doc_freq=100):
    # Count frequency of each token in the dataset
    document_freq = {}
    for tokenized_sentence in tokenized_sentences:
        for token in tokenized_sentence:
            document_freq[token] = document_freq.get(token, 0) + 1
    # Keep only tokens with min_doc_freq < freq < max_doc_freq
    qualified_tokens = {
        token: freq for token, freq in document_freq.items() if (min_doc_freq < freq < max_doc_freq)
    }
    # Assign token_ids for each token (0 and 1 are reserved for special tokens)
    vocab = {token: token_id + 2 for token_id, token in enumerate(qualified_tokens.keys())}
    # Add special tokens
    vocab['EOL'] = 0
    vocab['[UNK]'] = 1
    return vocab, qualified_tokens


vocab, doc_freq = make_vocabulary_from_tokens(train_processed, 0)  # use only the train set for this
print(f'{len(vocab)=}')
```
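As an extra check (not part of the original pipeline, just something that may be useful), this shows how much of the training data falls outside the vocabulary and will therefore be mapped to `[UNK]`:

```
# Optional sanity check: fraction of training tokens that are not in the vocabulary
total_tokens = sum(len(sentence) for sentence in train_processed)
oov_tokens = sum(1 for sentence in train_processed for token in sentence if token not in vocab)
print(f"OOV rate: {oov_tokens / total_tokens:.2%}")
```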
```
from gensim.models import KeyedVectors

CFG.vocab_size = len(vocab)  # Number of unique tokens in the vocabulary
CFG.embedding_dim = 300      # Dimension of Word2Vec embeddings
pretrained_w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
```
```
embedding_matrix = np.zeros((CFG.vocab_size, CFG.embedding_dim))
# Initialize embeddings for tokens in the vocabulary
for token, idx in vocab.items():
    # Handle padding and special tokens separately
    if token == 'EOL':  # Padding token
        embedding_matrix[idx] = np.zeros(CFG.embedding_dim)  # Zero vector for padding
    elif token == '[UNK]':  # Unknown token
        embedding_matrix[idx] = np.random.normal(0, 0.1, CFG.embedding_dim)  # Random initialization for the unknown token
    else:
        # For all other tokens in vocab, use pretrained embeddings if available
        if token in pretrained_w2v:
            embedding_matrix[idx] = pretrained_w2v[token]
        else:
            embedding_matrix[idx] = np.random.normal(0, 0.1, CFG.embedding_dim)  # Random initialization for out-of-vocab tokens
# Now the embedding matrix is ready to be used
```
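Again just as a check (my own addition), this reports how many vocabulary tokens actually receive a pretrained vector rather than a random one:

```
# Optional check: coverage of the vocabulary by the pretrained GoogleNews vectors
covered = sum(1 for token in vocab if token in pretrained_w2v)
print(f"{covered}/{len(vocab)} vocabulary tokens found in the pretrained embeddings")
```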
```
from torch.utils.data import TensorDataset, DataLoader


def tokens_to_ids(tokenized_sentences, vocab):
    token_ids = []
    for sentence in tokenized_sentences:
        ids = [vocab.get(token, vocab['[UNK]']) for token in sentence]
        token_ids.append(ids)
    return token_ids


train_ids = tokens_to_ids(train_processed, vocab)
valid_ids = tokens_to_ids(valid_processed, vocab)
test_ids = tokens_to_ids(test_processed, vocab)

# Convert to PyTorch tensors
train_tensor = torch.tensor(train_ids, dtype=torch.long)
valid_tensor = torch.tensor(valid_ids, dtype=torch.long)
test_tensor = torch.tensor(test_ids, dtype=torch.long)

train_dataset = TensorDataset(train_tensor)
valid_dataset = TensorDataset(valid_tensor)
test_dataset = TensorDataset(test_tensor)

train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=CFG.batch_size)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size)
```
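A quick shape check (again my own addition) to confirm each split comes out as (num_sentences, max_len):

```
# Each tensor should have shape (num_sentences, max_len), e.g. (N, 20)
print(train_tensor.shape, valid_tensor.shape, test_tensor.shape)
```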
```
import torch
import torch.nn as nn
import torch.nn.functional as F


class RecurrentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pretrained_embeddings):
        super(RecurrentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(pretrained_embeddings, dtype=torch.float32))
        self.embedding.weight.requires_grad = True
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs, hidden=None):
        embedded = self.embedding(inputs)  # Shape: (batch_size, seq_length, embedding_dim)
        batch_size, seq_length, _ = embedded.shape
        outputs = []
        for t in range(seq_length):
            inp = embedded[:, t, :].unsqueeze(1)        # Shape: (batch_size, 1, embedding_dim)
            out, hidden = self.lstm(inp, hidden)        # out: (batch_size, 1, hidden_dim)
            next_word_logits = self.fc(out.squeeze(1))  # Shape: (batch_size, vocab_size)
            outputs.append(next_word_logits)
        outputs = torch.stack(outputs, dim=1)  # Shape: (batch_size, seq_length, vocab_size)
        return outputs, hidden
```
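For what it's worth, `nn.LSTM` with `batch_first=True` can also consume the whole sequence in one call, which should be equivalent to the per-timestep loop above; a minimal sketch (same layer names assumed):

```
def forward_vectorized(self, inputs, hidden=None):
    embedded = self.embedding(inputs)          # (batch_size, seq_length, embedding_dim)
    out, hidden = self.lstm(embedded, hidden)  # (batch_size, seq_length, hidden_dim)
    logits = self.fc(out)                      # (batch_size, seq_length, vocab_size)
    return logits, hidden
```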
```
def predict_sentence(model, sentence, vocab, max_len=20):
    """
    Predicts the next 20 words for a given input sentence using the model.

    Args:
        model: Trained LSTM model.
        sentence: Input sentence (str).
        vocab: Vocabulary mapping (token to index).
        max_len: Fixed length for both input and output sequences.

    Returns:
        Predicted sentence (str) of exactly 20 words.
    """
    model.eval()  # Set the model to evaluation mode
    # Preprocess the input sentence
    tokens = preprocess_text(sentence)[:max_len]  # Trim to max_len
    tokens += ["EOL"] * (max_len - len(tokens))   # Pad with "EOL" to max_len
    # Convert tokens to IDs
    input_ids = torch.tensor([[vocab.get(token, vocab['[UNK]']) for token in tokens]], device=CFG.device)

    predictions = []  # List to store predicted tokens
    hidden = None     # Initialize LSTM hidden states
    # Generate 20 output words
    for _ in range(max_len):
        # Forward pass through the model
        outputs, hidden = model(input_ids, hidden)
        next_word_logits = outputs[:, -1, :]              # Get logits for the last predicted word
        predicted_id = next_word_logits.argmax(1).item()  # Get the most probable word ID
        predictions.append(predicted_id)                  # Save the predicted word ID
        # Prepare input for the next prediction step
        input_ids = torch.tensor([[predicted_id]], device=CFG.device)

    # Convert predicted IDs back to tokens
    predicted_tokens = [list(vocab.keys())[list(vocab.values()).index(idx)] for idx in predictions]
    # Return the joined predicted tokens as a single sentence
    return " ".join(predicted_tokens)
```
```
from sklearn.metrics import accuracy_score
import torch.optim as optim

# Move the embedding matrix to the device
embedding_matrix_tensor = torch.tensor(embedding_matrix, dtype=torch.float32).to(CFG.device)

# Define the model and set its embedding weights
model = RecurrentLSTM(CFG.vocab_size, CFG.embedding_dim, CFG.hidden_dim, pretrained_embeddings=embedding_matrix).to(CFG.device)
model.embedding.weight.data.copy_(embedding_matrix_tensor)
model.embedding.weight.requires_grad = True  # Allow fine-tuning of embeddings

# Define optimizer and loss function
criterion = nn.CrossEntropyLoss(ignore_index=vocab['EOL'])  # Ignore padding in loss calculation
optimizer = optim.Adam(model.parameters(), lr=CFG.learning_rate)
```
```
for epoch in range(CFG.epochs):
    model.train()
    total_loss = 0
    num_batches = 0
    for batch in train_loader:
        inputs = batch[0].to(CFG.device)
        targets = inputs[:, 1:]
        inputs = inputs[:, :-1]

        optimizer.zero_grad()
        outputs, _ = model(inputs)
        outputs = outputs.reshape(-1, CFG.vocab_size)
        targets = targets.reshape(-1)
        loss = criterion(outputs, targets)
        loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    avg_train_loss = total_loss / num_batches
    print(f"Epoch {epoch + 1}: Average Training Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    all_targets, all_predictions = [], []
    with torch.no_grad():
        for batch in valid_loader:
            inputs = batch[0].to(CFG.device)
            targets = inputs[:, 1:]
            inputs = inputs[:, :-1]
            outputs, _ = model(inputs)
            predictions = outputs.argmax(dim=-1)
            # Ignore `[UNK]` and `EOL` targets when computing accuracy
            mask = (targets != vocab['[UNK]']) & (targets != vocab['EOL'])
            filtered_targets = targets[mask]
            filtered_predictions = predictions[mask]
            all_targets.extend(filtered_targets.cpu().numpy())
            all_predictions.extend(filtered_predictions.cpu().numpy())

    val_accuracy = accuracy_score(all_targets, all_predictions)
    print(f"Epoch {epoch + 1}: Validation Accuracy: {val_accuracy:.4f}")

    # Example prediction
    if epoch == 0 or (epoch + 1) % 5 == 0:
        example_sentence = "A"
        predicted_sentence = predict_sentence(model, example_sentence, vocab)
        print(f"Example Prediction after Epoch {epoch + 1}:")
        print(f"Input Sentence: {example_sentence}")
        print(f"Predicted Sentence: {predicted_sentence}")
```
But it is giving very bad results, like the following:
```
Input Sentence: A
Predicted Sentence: recover [UNK] [UNK] [UNK] surprise set gift real [UNK] strongly daily basis big drug [UNK] chance told [UNK] [UNK] [UNK]
Epoch 496: Average Training Loss: 1.0183
Epoch 496: Validation Accuracy: 0.0039
Epoch 497: Average Training Loss: 1.0294
Epoch 497: Validation Accuracy: 0.0043
Epoch 498: Average Training Loss: 1.0322
Epoch 498: Validation Accuracy: 0.0045
Epoch 499: Average Training Loss: 1.0224
Epoch 499: Validation Accuracy: 0.0036
Epoch 500: Average Training Loss: 1.0140
Epoch 500: Validation Accuracy: 0.0040
Example Prediction after Epoch 500:
Input Sentence: A
Predicted Sentence: given daily accumulation [UNK] surprise across top tool real [UNK] shut [UNK] brought stealing hand gentle hand chaos [UNK] [UNK]
```