Embedding Question for RNN

Hello,

I am trying to adapt the code below, originally used for language translation (German -> English), for two-class classification of points. I want the decision for each point to be closely tied to the decisions for the other points, which is why I am using this network. My question concerns the sizes of the decoder and the embeddings. My current error is:
IndexError: index out of range in self
raised at:
output = model(torch.from_numpy(np.abs(inp_data1000)).to(torch.int64), torch.from_numpy(np.abs(target1000)).to(torch.int64))

I am almost positive the error comes down to this: in the original code, the tensors passed into the model's forward function have already been converted into vocabulary indices by the Field/BucketIterator pipeline. In my modifications, I cannot find where to do the equivalent step (encoding the data into indices before passing it into the model's forward function). It probably has something to do with the iterator, but I'm not sure. Right now I am passing negative values into the forward function, which of course gives an error.
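To illustrate the failure mode I think I'm hitting (a minimal standalone sketch, not my actual data): nn.Embedding raises exactly this IndexError whenever it receives a negative index or an index >= num_embeddings.

import torch
import torch.nn as nn

emb = nn.Embedding(num_embeddings=6000, embedding_dim=300)

emb(torch.tensor([[0, 2999, 5999]]))   # valid indices: works fine
# emb(torch.tensor([[-5]]))            # negative index -> IndexError: index out of range in self
# emb(torch.tensor([[6000]]))          # index == num_embeddings -> same IndexError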

My input data is in the range -3 to 3, and I'm trying to keep 3 decimal places of precision, so I set the input dictionary size to 6000, while the output dictionary size is 2 because I'm classifying between two classes. Any help would be really appreciated.
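For reference, here is a minimal sketch of the index mapping I have in mind (the helper name to_indices is just for illustration): shift the values by +3 so they are non-negative, then scale by 1000, so -3.000 maps to 0 and 3.000 maps to 6000 (strictly, that span covers 6001 distinct indices).

import numpy as np
import torch

def to_indices(x, low=-3.0, scale=1000):
    # Map values in [-3, 3] at 3-decimal resolution to integers in [0, 6000];
    # nn.Embedding needs non-negative indices strictly below num_embeddings.
    return torch.from_numpy(np.round((np.asarray(x) - low) * scale).astype(np.int64))

# Example: to_indices([-3.0, 0.0, 3.0]) -> tensor([   0, 3000, 6000])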

Code I am adapting:
source: aladdinpersson (Aladdin Persson) · GitHub
Code:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")

def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
english = Field(
    tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
)

train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"), fields=(german, english)
)

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, x):
        # x shape: (seq_length, N) where N is batch size
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding)
        # outputs shape: (seq_length, N, hidden_size)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super(Decoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # x shape: (N) where N is the batch size; we want it to be (1, N), since
        # seq_length is 1 here because we are sending in a single word and not a sentence
        x = x.unsqueeze(0)
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape: (1, N, hidden_size)
        predictions = self.fc(outputs)
        # predictions shape: (1, N, length_target_vocabulary); to send it to the
        # loss function we want it to be (N, length_target_vocabulary), so we
        # just remove the first dim
        predictions = predictions.squeeze(0)
        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english.vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word,
            # otherwise we take the word that the Decoder predicted it to be.
            # Teacher Forcing is used so that the model gets used to seeing
            # similar inputs at training and testing time; if teacher forcing is 1
            # then inputs at test time might be completely different than what the
            # network is used to.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

# We're ready to define everything we need for training our Seq2Seq model

# Training hyperparameters
num_epochs = 100
learning_rate = 0.001
batch_size = 64

# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024  # Needs to be the same for both RNNs
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard to get nice loss plot
writer = SummaryWriter(f"runs/loss_plot")
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    save_checkpoint(checkpoint)

    model.eval()
    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )
    print(f"Translated example sentence: \n {translated_sentence}")
    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but CrossEntropyLoss
        # doesn't take input in that form. For example, with MNIST we want the
        # output to be (N, 10) and the targets just (N). Here we can view it in a
        # similar way: we have output_words * batch_size predictions to send into
        # the cost function, so we need to do some reshaping. While we're at it,
        # let's also remove the start token.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1

score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score*100:.2f}")

Now, the code I am currently using, which gives the error:
# Current setup for seq2seq RNN

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
import random

train_data = data
train_data = np.round(train_data, 3)
train_targets = arrangements

class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, x):
        # x shape: (seq_length, N) where N is batch size
        # print(x.shape)
        # print(torch.max(x))
        # print(torch.min(x))
        embedding = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.rnn(embedding)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super(Decoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # x shape: (N) where N is the batch size; we want it to be (1, N), since
        # seq_length is 1 here because we are sending in a single word and not a sentence
        x = x.unsqueeze(0)
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape: (1, N, hidden_size)
        predictions = self.fc(outputs)
        # predictions shape: (1, N, length_target_vocabulary); to send it to the
        # loss function we want it to be (N, length_target_vocabulary), so we
        # just remove the first dim
        predictions = predictions.squeeze(0)
        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = 2  # len(english.vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word,
            # otherwise we take the word that the Decoder predicted it to be.
            # Teacher Forcing is used so that the model gets used to seeing
            # similar inputs at training and testing time; if teacher forcing is 1
            # then inputs at test time might be completely different than what the
            # network is used to.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

# We're ready to define everything we need for training our Seq2Seq model

# Training hyperparameters
num_epochs = 2
learning_rate = 0.001
batch_size = 64

# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = 6000  # Letting this have a resolution of 3 decimal points for the inputs. Was len(german.vocab)
input_size_decoder = 2
output_size = 2
encoder_embedding_size = 300  # I'm pretty sure this is fairly arbitrary
decoder_embedding_size = 300
hidden_size = 1024  # Needs to be the same for both RNNs
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    model.eval()
    model.train()

    for i in range(1):
        # Get input and targets and get to cuda
        inp_data = train_data  # batch.src.to(device)
        target = train_targets  # batch.trg.to(device)

        # Forward prop
        output = model(torch.from_numpy(inp_data * 1000).to(torch.int64), torch.from_numpy(target * 1000).to(torch.int64))
        # print(torch.from_numpy(inp_data).to(torch.int64))

        # Output is of shape (trg_len, batch_size, output_dim) but CrossEntropyLoss
        # doesn't take input in that form. For example, with MNIST we want the
        # output to be (N, 10) and the targets just (N). Here we can view it in a
        # similar way: we have output_words * batch_size predictions to send into
        # the cost function, so we need to do some reshaping. While we're at it,
        # let's also remove the start token.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, torch.from_numpy(target))

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # # Plot to tensorboard
        print(loss)