How to use a gensim vocabulary and a pytorch.dataloader for an lstm model?

I know this problem is quite similar with other question already asked Expected input to torch Embedding layer with pre_trained vectors from gensim , but I do not understand how to actually map a gensim vocab trained with dimensions (8000 x 200) on protein sequences on a dataloader. The dataloader take two ids from a tsv file and map them onto the corresponding fasta sequences in another file, and also takes the coresponding label associated with each pairs of ids.

class Pairs:
def init(self, pairs_file, seq_file):
self.labels = pd.read_csv(pairs_file, sep=‘\t’)
print(self.labels.shape)
self.seq_file = seq_file
print(isinstance(self.seq_file, dict))

def __len__(self): return len(self.seq_file)

def __getitem__(self, item):
for key, value in self.seq_file.items():
if key == self.labels.iloc[item, 0]:
protein1 = key
elif key == self.labels.iloc[item, 1]:
protein2 = key
label = self.labels.iloc[item, 2]
return {
"protein1": torch.tensor(protein1, dtype=torch.long),
"protein2": torch.tensor(protein2, dtype=torch.long),
"label": torch.tensor(label, dtype=torch.long)
}

I built a simple lstm model which contains an embedding layer, where vocab is a tensor obtained from the keyedvectors of gensim model.

class PLM(nn.Module):
def init(self):
super(PLM, self).init()
self.word_embedding = nn.Embedding.from_pretrained(vocab)
self.lstm = nn.LSTM(500, 300, 1)

def forward(self, x):
	print(x.shape)
	text_emb = self.word_embeddings(x)
	print("Tesx shape:", text_emb.shape)
	lstm_out, lstm_hidden = self.lstm(text_emb)
	lstm_out = lstm_out[:,-1,:]
	print("lstm_out shape:", lstm_out.shape)
	output = self.dense(drop_out)
	return output
PLM()

model = PLM()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train(data_loader,model,optimizer,device):
“”"
This is the main training function that trains model
for one epoch
:param data_loader: this is the torchdataloader
:param model: model(lstm model)
:param optimizer: optimizer Adam, SGD etc
:param device: this can be “cuda” or “cpu”
“”"
# set the model to training mode
model.train()
# go through the batches of data in data_loader:
for data in data_loader:
    protein1 = data["protein1"]
    protein2 = data["protein2"]
    label = data["label"]
    protein1 = protein1.to(device,dtype=torch.long)
    protein2 = protein2.to(device,dtype=torch.long)
    label = label.to(device, dtype=torch.float)
    optimizer.zero_grad()
    predictions = model(protein1, protein2)
    loss = nn.MSELoss()(predictions,label.view(-1,1))
    loss.backward()
    optimizer.step()

train(pairs, model=model, optimizer=optimizer, device=‘cpu’)

Traceback (most recent call last):
File “/hdda/mihai/LLM/PPI_Project/dscript-data/info/./lstm.py”, line 69, in
train(pairs, model=model, optimizer=optimizer, device=‘cpu’)
~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “/hdda/mihai/LLM/PPI_Project/dscript-data/info/./lstm.py”, line 56, in train
for data in data_loader:
^^^^^^^^^^^
File “/hdda/mihai/LLM/PPI_Project/dscript-data/info/preprocessing.py”, line 47, in getitem
“protein1”: torch.tensor(protein1, dtype=torch.long),
~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: new(): invalid data type ‘str’

Should I transform my sequences protein1 protein2 into numerical vectors, or can I integrate the result from gensim model into the dataloader? I know from a previous discussion that I should also have a word2index dictionary, but I assume this is already the case using gensim, but I do not know how to actually extract the dictionary and use it for my model, I tried to get the indices with:

keys = model.wv
for key in keys:

idx = keys.get_index[key] 

but I get:

Traceback (most recent call last):
File “/hdda/mihai/LLM/PPI_Project/dscript-data/info/./trigram.py”, line 75, in
idx = keys.get_index[key]
~~~~~~~~~~~~~~^^^^^
TypeError: ‘method’ object is not subscriptable