I know this problem is quite similar with other question already asked Expected input to torch Embedding layer with pre_trained vectors from gensim , but I do not understand how to actually map a gensim vocab trained with dimensions (8000 x 200) on protein sequences on a dataloader. The dataloader take two ids from a tsv file and map them onto the corresponding fasta sequences in another file, and also takes the coresponding label associated with each pairs of ids.
class Pairs:
def init(self, pairs_file, seq_file):
self.labels = pd.read_csv(pairs_file, sep=‘\t’)
print(self.labels.shape)
self.seq_file = seq_file
print(isinstance(self.seq_file, dict))
def __len__(self): return len(self.seq_file)
def __getitem__(self, item):
for key, value in self.seq_file.items():
if key == self.labels.iloc[item, 0]:
protein1 = key
elif key == self.labels.iloc[item, 1]:
protein2 = key
label = self.labels.iloc[item, 2]
return {
"protein1": torch.tensor(protein1, dtype=torch.long),
"protein2": torch.tensor(protein2, dtype=torch.long),
"label": torch.tensor(label, dtype=torch.long)
}I built a simple lstm model which contains an embedding layer, where vocab is a tensor obtained from the keyedvectors of gensim model.
class PLM(nn.Module):
definit(self):
super(PLM, self).init()
self.word_embedding = nn.Embedding.from_pretrained(vocab)
self.lstm = nn.LSTM(500, 300, 1)
def forward(self, x):
print(x.shape)
text_emb = self.word_embeddings(x)
print("Tesx shape:", text_emb.shape)
lstm_out, lstm_hidden = self.lstm(text_emb)
lstm_out = lstm_out[:,-1,:]
print("lstm_out shape:", lstm_out.shape)
output = self.dense(drop_out)
return output
PLM() model = PLM() criterion = nn.MSELoss() optimizer = torch.optim.Adam(model.parameters(), lr=0.01) def train(data_loader,model,optimizer,device): “”" This is the main training function that trains model for one epoch :param data_loader: this is the torchdataloader :param model: model(lstm model) :param optimizer: optimizer Adam, SGD etc :param device: this can be “cuda” or “cpu” “”" # set the model to training mode model.train()
# go through the batches of data in data_loader:
for data in data_loader:
protein1 = data["protein1"]
protein2 = data["protein2"]
label = data["label"]
protein1 = protein1.to(device,dtype=torch.long)
protein2 = protein2.to(device,dtype=torch.long)
label = label.to(device, dtype=torch.float)
optimizer.zero_grad()
predictions = model(protein1, protein2)
loss = nn.MSELoss()(predictions,label.view(-1,1))
loss.backward()
optimizer.step()
train(pairs, model=model, optimizer=optimizer, device=‘cpu’)
Traceback (most recent call last):
File “/hdda/mihai/LLM/PPI_Project/dscript-data/info/./lstm.py”, line 69, in
train(pairs, model=model, optimizer=optimizer, device=‘cpu’)
~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “/hdda/mihai/LLM/PPI_Project/dscript-data/info/./lstm.py”, line 56, in train
for data in data_loader:
^^^^^^^^^^^
File “/hdda/mihai/LLM/PPI_Project/dscript-data/info/preprocessing.py”, line 47, ingetitem
“protein1”: torch.tensor(protein1, dtype=torch.long),
~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: new(): invalid data type ‘str’
Should I transform my sequences protein1 protein2 into numerical vectors, or can I integrate the result from gensim model into the dataloader? I know from a previous discussion that I should also have a word2index dictionary, but I assume this is already the case using gensim, but I do not know how to actually extract the dictionary and use it for my model, I tried to get the indices with:
keys = model.wv for key in keys: idx = keys.get_index[key]
but I get:
Traceback (most recent call last):
File “/hdda/mihai/LLM/PPI_Project/dscript-data/info/./trigram.py”, line 75, in
idx = keys.get_index[key]
~~~~~~~~~~~~~~^^^^^
TypeError: ‘method’ object is not subscriptable