Hi, I am new to PyTorch and I am trying to build a model that takes a drug sequence and a protein sequence and classifies their interaction type into one of 3 classes.
After building and training the model, I noticed that it is not learning anything: the loss value stays the same throughout training because the model assigns all samples to one class and ignores the other classes.
For the data, I pad every drug to the length of the longest drug, and likewise for the proteins. Then, after collecting all the characters used in the drugs and in the proteins separately, I convert each letter of a sequence to its number from the corresponding dictionary (letters are indexed starting from 1), keeping 0 as the padding value that is used to unify the sequence lengths.
If I am doing anything wrong, please guide me, and sorry for the messy code.
The data structure looks like this:
#protein:‘MSTESMIRDVELAEEALPKKTGGPQGSRRCLFLSLFSFLIVAGATTLFCLLHFGVIGPQREEFPRDLSLISPLAQAVRSSSRTPSDKPVAHVVANPQAEGQLQWLNRRANALLANGVELRDNQLVVPSEGLYLIYSQVLFKGQGCPSTHVLLTHTISRIAVSYQTKVNLLSAIKSPCQRETPEGAEAKPWYEPIYLGGVFQLEKGDRLSAEINRPDYLDFAESGQVYFGIIAL’
#drug:‘CCOC(=O)CCCOc1nc(Cl)nc2cc(OC)c(OC)cc12’
#clss:‘3.’
##code##
#-----------------------------------------longest element function-------------------------------------
def longest_element(data_column):
    """Return the length of the longest entry in a pandas column.

    Args:
        data_column: Object exposing ``.values``. Either a single-column
            ``DataFrame`` (``df[["col"]]``, whose rows are 1-element arrays)
            or a ``Series`` (``df["col"]``, whose rows are the entries
            themselves).

    Returns:
        The largest ``len()`` found among the entries, or ``0`` when the
        column is empty.
    """
    longest = 0  # deliberately not named ``max`` so the builtin stays usable
    for row in data_column.values:
        # Rows of a single-column DataFrame are 1-element arrays: unwrap them.
        # Rows of a Series are already the entries and have no ``.item()``.
        entry = row.item() if hasattr(row, "item") else row
        longest = max(longest, len(entry))
    return longest
#-----------------------------------------vocab generation function-------------------------------------
def get_vocab(data_col):
    """Collect every distinct character used in a pandas column.

    The previous ``type(data_col) == 'pandas.core.series.Series'`` check
    compared a type object with a string, so it was always False and the
    "remove redundant rows" branch never ran. It has been dropped:
    duplicates are harmless because each character is recorded only once.

    Args:
        data_col: Object exposing ``.values``. Either a single-column
            ``DataFrame`` (rows are 1-element arrays) or a ``Series``
            (rows are the strings themselves).

    Returns:
        A string containing each character exactly once, in order of first
        appearance. The position of a character in this string is what the
        callers enumerate to build their index dictionaries.
    """
    # A dict keeps insertion order and gives O(1) membership, unlike the
    # repeated ``char in vocab`` scan over a growing string.
    seen = {}
    for row in data_col.values:
        # Unwrap the 1-element array produced by a single-column DataFrame.
        text = row.item() if hasattr(row, "item") else row
        for char in text:
            seen.setdefault(char, None)
    return "".join(seen)
#-----------------------------------------word to ohe mat function-------------------------------------
def word2mat(word, vocab):
    """One-hot encode ``word`` against ``vocab``.

    Args:
        word: Iterable of symbols (normally a string) to encode.
        vocab: Sequence of known symbols. The row of a symbol is the position
            of its first occurrence in ``vocab``.

    Returns:
        A float tensor of shape ``(len(vocab), len(word))`` in which column
        ``i`` has a single 1 in the row belonging to ``word[i]``.

    Raises:
        ValueError: If ``word`` contains a symbol that is not in ``vocab``.
            Previously ``vocab.find`` returned -1 for such symbols, which
            silently encoded them as the *last* vocabulary symbol.
    """
    # First occurrence wins, matching what ``str.find`` used to return.
    row_of = {}
    for position, symbol in enumerate(vocab):
        row_of.setdefault(symbol, position)

    mat = torch.zeros(len(vocab), len(word))  # [vocab size, word length]
    for column, letter in enumerate(word):
        if letter not in row_of:
            raise ValueError(f"symbol {letter!r} is not in the vocabulary")
        mat[row_of[letter], column] = 1
    return mat
#-------------------------------------------sequence to numbers-------------------------------------------------
def seq2idx(seq, vocab, indexer, uniform_length, n_samples=1):
    """Convert one or several sequences into a zero-padded index tensor.

    Args:
        seq: A single sequence of symbols when ``n_samples == 1``, otherwise
            an iterable of such sequences (one per output row).
        vocab: Unused; kept so existing callers keep working.
        indexer: Mapping from symbol to its integer index. Index 0 is left
            free because it is the padding value of the output.
        uniform_length: Number of columns of the output; shorter sequences
            are right-padded with 0.
        n_samples: Number of rows of the output.

    Returns:
        An ``int32`` tensor of shape ``(n_samples, uniform_length)``.

    Raises:
        KeyError: If a symbol is missing from ``indexer``.
        IndexError: If a sequence is longer than ``uniform_length`` or more
            non-empty sequences than ``n_samples`` are supplied.
    """
    encoded = torch.zeros(n_samples, uniform_length, dtype=torch.int32)
    # Treat the single-sample case as a batch of one so both share one path.
    rows = [seq] if n_samples == 1 else seq
    for row_no, tokens in enumerate(rows):
        ids = [indexer[token] for token in tokens]
        if not ids:
            continue  # nothing to write; the row stays all padding
        if len(ids) > uniform_length:
            raise IndexError(
                f"sequence of length {len(ids)} exceeds "
                f"uniform_length={uniform_length}"
            )
        # One slice assignment per row instead of one tensor write per symbol.
        encoded[row_no, :len(ids)] = torch.tensor(ids, dtype=torch.int32)
    return encoded
#-----------------------------------------model------------------------------------------------------
class classifier(nn.Module):
    """Two-branch GRU classifier for drug/protein interaction types.

    Each input (drug and protein) is an integer-encoded, zero-padded
    sequence. Every branch embeds its sequence, runs it through a stacked
    GRU and keeps the final hidden state of every layer. The two flattened
    states are concatenated and passed through an MLP that outputs
    ``num_cls`` raw logits (no softmax).

    Index 0 is treated as padding: its embedding is fixed to zero and the
    GRUs run on packed sequences, so the final hidden state is taken at the
    last real symbol instead of after the trailing padding.

    NOTE(review): the default of 32 stacked GRU layers per branch is very
    deep for this kind of model and is a likely reason for training to
    stall. The defaults are kept for compatibility; consider passing 1-3
    layers explicitly.

    Args:
        drug_num_words: Number of distinct drug symbols (excluding padding).
        prot_num_words: Number of distinct protein symbols (excluding padding).
        len_drug: Padded drug length. Accepted but not used by the model.
        len_prot: Padded protein length. Accepted but not used by the model.
        drug_emb_dim, prot_emb_dim: Embedding sizes.
        drug_gru_dim, prot_gru_dim: GRU hidden sizes.
        drug_num_gru_layers, prot_num_gru_layers: Number of stacked GRU layers.
        out_shape: Output size of the (currently unused) per-branch decoders.
        hidden_out: Width of the first layer after concatenation.
        num_cls: Number of output classes.
        batch_size: Initial batch size used by ``init_hidden``.
        dvc: Device name stored on the instance.
    """

    def __init__(self, drug_num_words, prot_num_words, len_drug, len_prot,
                 drug_emb_dim=128, drug_gru_dim=64, drug_num_gru_layers=32,
                 prot_emb_dim=128, prot_gru_dim=64, prot_num_gru_layers=32,
                 out_shape=512, hidden_out=512, num_cls=3, batch_size=1, dvc='cpu'):
        super().__init__()
        # drug vars
        self.drug_emb_dim = drug_emb_dim
        self.drug_gru_dim = drug_gru_dim
        self.drug_num_gru_layers = drug_num_gru_layers
        # prot vars
        self.prot_emb_dim = prot_emb_dim
        self.prot_gru_dim = prot_gru_dim
        self.prot_num_gru_layers = prot_num_gru_layers
        # global vars
        self.out_shape = out_shape
        self.hidden_out = hidden_out
        self.batch_size = batch_size
        self.dvc = dvc
        self.num_cls = num_cls
        # drug side (+1 because index 0 is reserved for padding)
        self.drug_encoder = nn.Embedding(drug_num_words + 1, drug_emb_dim, padding_idx=0)
        self.drug_gru = nn.GRU(drug_emb_dim, drug_gru_dim, num_layers=drug_num_gru_layers,
                               batch_first=True, dropout=0.2)
        # Not used by forward(); kept so existing checkpoints still load.
        self.drug_decoder = nn.Linear(drug_gru_dim * drug_num_gru_layers, out_shape)
        # prot side (+1 because index 0 is reserved for padding)
        self.prot_encoder = nn.Embedding(prot_num_words + 1, prot_emb_dim, padding_idx=0)
        self.prot_gru = nn.GRU(prot_emb_dim, prot_gru_dim, num_layers=prot_num_gru_layers,
                               batch_first=True, dropout=0.2)
        # Not used by forward(); kept so existing checkpoints still load.
        self.prot_decoder = nn.Linear(prot_gru_dim * prot_num_gru_layers, out_shape)
        # collector side: input is the concatenation of both flattened hidden states
        self.decoder = nn.Linear(
            self.drug_num_gru_layers * self.drug_gru_dim
            + self.prot_num_gru_layers * self.prot_gru_dim,
            hidden_out,
        )
        self.relu = nn.ReLU()
        self.converter = nn.Linear(hidden_out, 512)
        self.converter2 = nn.Linear(512, 256)
        self.converter3 = nn.Linear(256, num_cls)

    def init_hidden(self, num_gru_layers, gru_dim):
        """Return a zero hidden state on the same device as the model.

        The previous version ended in ``.to()`` without arguments, which left
        the state on the CPU even when the model lived on the GPU.
        forward() no longer needs this method (a GRU defaults to a zero
        initial state); it is kept for callers that use it directly.
        """
        device = next(self.parameters()).device
        return torch.zeros(num_gru_layers, self.batch_size, gru_dim, device=device)

    @staticmethod
    def _encode_branch(tokens, embedding, gru):
        """Embed ``tokens``, run the GRU and return the flattened final state.

        Returns a tensor of shape ``(batch, num_layers * hidden_size)``.
        """
        # Real length = number of non-padding symbols. pack_padded_sequence
        # needs CPU lengths of at least 1, hence the clamp for empty rows.
        lengths = (tokens != 0).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedding(tokens), lengths, batch_first=True, enforce_sorted=False
        )
        _, hidden = gru(packed)  # hidden: (num_layers, batch, hidden_size)
        hidden = hidden.transpose(0, 1)  # -> (batch, num_layers, hidden_size)
        return hidden.reshape(hidden.shape[0], -1)

    def forward(self, drug_vec, prot_vec):
        """Return raw class logits of shape ``(batch, num_cls)``.

        Args:
            drug_vec: Integer tensor ``(batch, drug_len)``, right-padded with 0.
            prot_vec: Integer tensor ``(batch, prot_len)``, right-padded with 0.
        """
        # Keep the stored batch size in sync for init_hidden() users.
        self.batch_size = drug_vec.size(0)
        drug_out = self._encode_branch(drug_vec, self.drug_encoder, self.drug_gru)
        prot_out = self._encode_branch(prot_vec, self.prot_encoder, self.prot_gru)
        # combined part
        out = torch.cat((drug_out, prot_out), dim=1)
        out = self.relu(self.decoder(out))
        out = self.relu(self.converter(out))
        out = self.relu(self.converter2(out))
        return self.converter3(out)
# Training script: builds the vocabularies, the model and runs a
# one-sample-per-step training loop over the CSV.
# Fixes relative to the pasted version: typographic quotes replaced by plain
# quotes, the undefined name ``daten`` replaced by ``dataset``, and the
# progress print no longer calls numpy on tensors that may live on the GPU.
dataset_path = 'D:/python/work/dataset/final_dataset/daten.csv'
# read data
dataset = pd.read_csv(dataset_path)
drug_vocab = get_vocab(dataset[['drug_smile']])
prot_vocab = get_vocab(dataset[['sequence']])
# Indices start at 1 so that 0 stays free as the padding value.
drug2idx = {val: c + 1 for c, val in enumerate(drug_vocab)}
prot2idx = {val: c + 1 for c, val in enumerate(prot_vocab)}
longest_drug = longest_element(dataset[['drug_smile']])
longest_prot = longest_element(dataset[['sequence']])
dvc = 'cuda' if torch.cuda.is_available() else 'cpu'
num_epochs = 10
# Row k is the one-hot target for class label k (labels 1..3); row 0 is all
# zeros, so a label of 0 contributes zero loss.
all_classes = torch.tensor([[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]],
                           dtype=torch.float32).to(dvc)
# NOTE(review): the model defaults to 32 stacked GRU layers per branch, and
# plain SGD with lr=0.001 on single-sample steps learns very slowly. Both are
# likely contributors to the flat loss -- consider fewer layers and a larger
# learning rate or an adaptive optimizer.
model = classifier(len(drug_vocab), len(prot_vocab), longest_drug, longest_prot, dvc=dvc)
model = model.to(dvc)
criterion = nn.CrossEntropyLoss()
optim = torch.optim.SGD([p for p in model.parameters() if p.requires_grad], lr=0.001)
# train loop-------------------------------------------------------------------------------------------
model.train()
train_losses = []
print('starting to learn . . .')
for epoch in range(num_epochs):
    losses = []
    samples = zip(dataset['drug_smile'], dataset['sequence'], dataset['clss'])
    for i, (drug_seq, prot_seq, label) in enumerate(samples):
        drug = seq2idx(drug_seq, drug_vocab, drug2idx, longest_drug).to(dvc)
        prot = seq2idx(prot_seq, prot_vocab, prot2idx, longest_prot).to(dvc)
        clss = all_classes[int(label)].unsqueeze(0)
        optim.zero_grad()
        y_pred = model(drug, prot)
        loss = criterion(y_pred, clss)
        loss.backward()
        #nn.utils.clip_grad_norm_(model.parameters(), 3) #opt
        optim.step()
        losses.append(loss.item())
        if (i + 1) % 5 == 0:
            # Tensor argmax works on any device, unlike np.argmax/.numpy().
            predicted = int(y_pred.argmax()) + 1
            expected = int(clss.argmax()) + 1
            print(f'epoch {epoch+1}/{num_epochs} batch {i+1}/{len(dataset)} batch loss {loss.item()} clss {predicted}/{expected}')
    # Guard against an empty dataset instead of dividing by zero.
    epoch_loss = sum(losses) / len(losses) if losses else float('nan')
    train_losses.append(epoch_loss)
    print(f'epoch{epoch+1} loss value is {epoch_loss}')