Model is not learning

Hi, I am new to PyTorch and I am trying to build a model that takes a drug sequence and a protein sequence and classifies their interaction type into one of 3 classes.
After building the model and training it, I noticed that it is not learning anything: the loss value stays the same throughout training because the model puts every sample into one class and ignores the other classes.
For the data, I pad all drugs to the length of the longest drug and do the same for the proteins. After collecting all the characters used in drugs and proteins separately, I convert every letter in a sequence to its number from the respective dictionary (letters are indexed starting from 1), and I keep the value 0 as the padding value used to unify the sequence lengths (a small worked example of this encoding follows the data sample below).
If I am doing anything wrong, please guide me, and sorry for the messy code.
The data structure is like:
#protein: 'MSTESMIRDVELAEEALPKKTGGPQGSRRCLFLSLFSFLIVAGATTLFCLLHFGVIGPQREEFPRDLSLISPLAQAVRSSSRTPSDKPVAHVVANPQAEGQLQWLNRRANALLANGVELRDNQLVVPSEGLYLIYSQVLFKGQGCPSTHVLLTHTISRIAVSYQTKVNLLSAIKSPCQRETPEGAEAKPWYEPIYLGGVFQLEKGDRLSAEINRPDYLDFAESGQVYFGIIAL'
#drug: 'CCOC(=O)CCCOc1nc(Cl)nc2cc(OC)c(OC)cc12'
#clss: '3.'
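
For example, with a made-up vocabulary the encoding of one short sequence would look like this (the vocabulary and sequence here are invented just for illustration):

#toy illustration of the index encoding described above (made-up vocabulary)
toy_vocab = 'CO(='                                        #characters collected from a column
toy2idx = {ch: i + 1 for i, ch in enumerate(toy_vocab)}   #{'C': 1, 'O': 2, '(': 3, '=': 4}
uniform_length = 8                                        #length of the longest sequence
seq = 'CC(=O'
encoded = [toy2idx[ch] for ch in seq] + [0] * (uniform_length - len(seq))
print(encoded)                                            #[1, 1, 3, 4, 2, 0, 0, 0]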
##code##
import pandas as pd
import torch
import torch.nn as nn
#-----------------------------------------longest element function-------------------------------------
def longest_element(data_column):
    #return the length of the longest sequence in a column
    longest = 0
    for row in data_column.values:
        if len(row.item()) > longest:
            longest = len(row.item())

    return longest
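
I think the same value could also be computed directly with pandas string methods, e.g. (a sketch, assuming the column holds plain strings):

#pandas alternative for the longest sequence length
longest_drug_alt = dataset['drug_smile'].str.len().max()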

#-----------------------------------------vocab generation function-------------------------------------
def get_vocab(data_col):
    #return all characters used in a column as a string

    #start with an empty vocabulary
    vocab = ''

    #remove redundant rows (a Series supports .unique() directly)
    if isinstance(data_col, pd.Series):
        data = data_col.unique()          #numpy array of unique strings
    else:
        data = data_col.values            #2-D array for a one-column DataFrame

    #loop over all rows
    for row in data:
        #row is a plain string for a Series, a length-1 array for a DataFrame
        text = row if isinstance(row, str) else row.item()
        #loop over all chars in a single row
        for char in text:
            #add char to vocab if it is not already there
            if char not in vocab:
                vocab = vocab + char

    #return vocab
    return vocab

#-----------------------------------------word to ohe mat function-------------------------------------
def word2mat(word, vocab):
    #one-hot matrix of shape [len(vocab), len(word)]: one column per letter of the word
    mat = torch.zeros(len(vocab), len(word))
    for i, letter in enumerate(word):
        mat[vocab.find(letter), i] = 1
    return mat

#-------------------------------------------sequence to numbers-------------------------------------------------
def seq2idx(seq, vocab, indexer, uniform_length, n_samples=1):
    #convert a sequence (or a batch of sequences) to an index tensor, 0-padded to uniform_length
    num_seq = torch.zeros(n_samples, uniform_length, dtype=torch.int32)
    if n_samples == 1:
        #seq is a single string
        for c1, letter in enumerate(seq):
            num_seq[0, c1] = indexer[letter]
    else:
        #seq is an iterable of strings
        for c1, line in enumerate(seq):
            for c2, letter in enumerate(line):
                num_seq[c1, c2] = indexer[letter]
    return num_seq
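
For a single sample this returns a [1, uniform_length] tensor; a quick shape check (the actual values depend on the vocabulary order):

#example call for one sequence
drug_example = seq2idx('CCO', drug_vocab, drug2idx, longest_drug)
print(drug_example.shape)   #torch.Size([1, longest_drug])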

#-----------------------------------------model------------------------------------------------------
class classifier(nn.Module):

    def __init__(self, drug_num_words, prot_num_words, len_drug, len_prot,
                 drug_emb_dim=128, drug_gru_dim=64, drug_num_gru_layers=32,
                 prot_emb_dim=128, prot_gru_dim=64, prot_num_gru_layers=32,
                 out_shape=512, hidden_out=512, num_cls=3, batch_size=1, dvc='cpu'):
        super(classifier, self).__init__()
        #drug vars
        self.drug_emb_dim = drug_emb_dim
        self.drug_gru_dim = drug_gru_dim
        self.drug_num_gru_layers = drug_num_gru_layers
        #prot vars
        self.prot_emb_dim = prot_emb_dim
        self.prot_gru_dim = prot_gru_dim
        self.prot_num_gru_layers = prot_num_gru_layers
        #global vars
        self.out_shape = out_shape
        self.hidden_out = hidden_out
        self.batch_size = batch_size
        self.dvc = dvc
        self.num_cls = num_cls
        #drug side (+1 embedding row for the padding index 0)
        self.drug_encoder = nn.Embedding(drug_num_words + 1, drug_emb_dim)
        self.drug_gru = nn.GRU(drug_emb_dim, drug_gru_dim, num_layers=drug_num_gru_layers, batch_first=True, dropout=0.2)
        self.drug_decoder = nn.Linear(drug_gru_dim * drug_num_gru_layers, out_shape)
        #prot side
        self.prot_encoder = nn.Embedding(prot_num_words + 1, prot_emb_dim)
        self.prot_gru = nn.GRU(prot_emb_dim, prot_gru_dim, num_layers=prot_num_gru_layers, batch_first=True, dropout=0.2)
        self.prot_decoder = nn.Linear(prot_gru_dim * prot_num_gru_layers, out_shape)
        #collector side
        #self.decoder = nn.Linear(out_shape*2, hidden_out)
        self.decoder = nn.Linear(self.drug_num_gru_layers*self.drug_gru_dim + self.prot_num_gru_layers*self.prot_gru_dim, hidden_out)
        self.relu = nn.ReLU()
        self.converter = nn.Linear(hidden_out, 512)
        self.converter2 = nn.Linear(512, 256)
        self.converter3 = nn.Linear(256, num_cls)

    def init_hidden(self, num_gru_layers, gru_dim):
        #initial hidden state on the same device as the model inputs
        return torch.zeros(num_gru_layers, self.batch_size, gru_dim).to(self.dvc)

    def forward(self, drug_vec, prot_vec):
        #re-assign batch size in case it changes between calls
        batch_size = drug_vec.size(0)
        if batch_size != self.batch_size:
            self.batch_size = batch_size
        #drug part
        drug_encoded = self.drug_encoder(drug_vec)
        drug_out, drug_hidden = self.drug_gru(drug_encoded, self.init_hidden(self.drug_num_gru_layers, self.drug_gru_dim))
        #drug_out = self.drug_decoder(drug_out[:,:,-1].squeeze())
        #drug_out = self.drug_decoder(drug_out.reshape(batch_size,-1))
        #use the final hidden state of every GRU layer as the drug representation
        drug_out = drug_hidden.transpose(0, 1)
        drug_out = drug_out.reshape(drug_out.shape[0], -1)
        #prot part
        prot_encoded = self.prot_encoder(prot_vec)
        prot_out, prot_hidden = self.prot_gru(prot_encoded, self.init_hidden(self.prot_num_gru_layers, self.prot_gru_dim))
        #prot_out = self.prot_decoder(prot_out[:,:,-1].squeeze())
        #prot_out = self.prot_decoder(prot_out.reshape(batch_size,-1))
        prot_out = prot_hidden.transpose(0, 1)
        prot_out = prot_out.reshape(prot_out.shape[0], -1)
        #combined part
        out = torch.cat((drug_out, prot_out), dim=1)
        out = self.relu(self.decoder(out))
        out = self.relu(self.converter(out))
        out = self.relu(self.converter2(out))
        out = self.converter3(out)

        return out  #, hidden
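
To check the shapes, I run a dummy forward pass like this (made-up vocabulary sizes and lengths, batch of 2):

#hypothetical sanity check with random indices
m = classifier(drug_num_words=30, prot_num_words=25, len_drug=100, len_prot=250)
d = torch.randint(0, 31, (2, 100), dtype=torch.int32)
p = torch.randint(0, 26, (2, 250), dtype=torch.int32)
print(m(d, p).shape)   #torch.Size([2, 3])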

dataset_path = 'D:/python/work/dataset/final_dataset/daten.csv'
#read data
dataset = pd.read_csv(dataset_path)
drug_vocab = get_vocab(dataset[['drug_smile']])
prot_vocab = get_vocab(dataset[['sequence']])
drug2idx = {val: c + 1 for c, val in enumerate(drug_vocab)}   #letters indexed from 1; 0 is padding
prot2idx = {val: c + 1 for c, val in enumerate(prot_vocab)}
longest_drug = longest_element(dataset[['drug_smile']])
longest_prot = longest_element(dataset[['sequence']])

dvc = 'cuda' if torch.cuda.is_available() else 'cpu'
num_epochs = 10
#one-hot target rows; row 0 is unused because the class labels start at 1
all_classes = torch.tensor([[0,0,0],[1,0,0],[0,1,0],[0,0,1]], dtype=torch.float32)

model = classifier(len(drug_vocab), len(prot_vocab), longest_drug, longest_prot, dvc=dvc)
model = model.to(dvc)
criterion = nn.CrossEntropyLoss()
optim = torch.optim.SGD([p for p in model.parameters() if p.requires_grad], lr=0.001)
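
As far as I understand, nn.CrossEntropyLoss can also take plain class indices instead of one-hot rows, which is what I see in most examples; a self-contained sketch of that variant (with made-up logits, and label 3 shifted down to index 2):

#index-target variant of the loss (illustrative values only)
logits = torch.tensor([[0.2, 0.1, 0.7]])          #shape [1, num_cls]
target_idx = torch.tensor([2], dtype=torch.long)  #class index in 0..2
print(nn.CrossEntropyLoss()(logits, target_idx))  #scalar loss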
#train loop-------------------------------------------------------------------------------------------
model.train()
train_losses = []
print('starting to learn . . .')
for epoch in range(num_epochs):
    losses = []
    total = 0
    for i in range(len(dataset)):
        drug = seq2idx(dataset['drug_smile'][i], drug_vocab, drug2idx, longest_drug).to(dvc)
        prot = seq2idx(dataset['sequence'][i], prot_vocab, prot2idx, longest_prot).to(dvc)
        clss = all_classes[int(dataset['clss'][i])].to(dvc)
        clss = clss.unsqueeze(0)

        model.zero_grad()
        y_pred = model(drug, prot)
        loss = criterion(y_pred, clss)
        loss.backward()
        #nn.utils.clip_grad_norm_(model.parameters(), 3) #opt
        optim.step()

        losses.append(loss.item())
        total += 1

        if (i + 1) % 5 == 0:
            pred_cls = torch.argmax(y_pred, dim=1).item() + 1   #back to the 1-based labels
            true_cls = torch.argmax(clss, dim=1).item() + 1
            print(f'epoch {epoch+1}/{num_epochs} sample {i+1}/{len(dataset)} loss {loss.item()} clss {pred_cls}/{true_cls}')

    epoch_loss = sum(losses) / total
    train_losses.append(epoch_loss)
    print(f'epoch {epoch+1} loss value is {epoch_loss}')
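
Since the model seems to collapse onto a single class, I also check the predicted class distribution after training like this (a sketch reusing the names from the loop above):

#count how often each 1-based class is predicted over the whole dataset
from collections import Counter
model.eval()
pred_counts = Counter()
with torch.no_grad():
    for i in range(len(dataset)):
        drug = seq2idx(dataset['drug_smile'][i], drug_vocab, drug2idx, longest_drug).to(dvc)
        prot = seq2idx(dataset['sequence'][i], prot_vocab, prot2idx, longest_prot).to(dvc)
        pred_counts[torch.argmax(model(drug, prot), dim=1).item() + 1] += 1
print(pred_counts)   #e.g. Counter({1: 950, 2: 30, 3: 20}) would confirm the collapse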