Am I doing it correctly? Two dense layers over a single GRU (multi-loss)

I am trying to add two dense layers on top of a single GRU to build a joint classifier that predicts both a 3-label and a 9-label target for a single text. I sum the losses from both heads.
Am I doing it right? Do we need two optimizers? Is there any further improvement that could be made?
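
To make the question concrete, here is the gist of what I am aiming for as a small self-contained sketch (toy shapes and placeholder names, not my real data; the full code is below):

import torch
import torch.nn as nn

# one shared GRU encoder, two linear heads, one summed loss
gru = nn.GRU(input_size=300, hidden_size=128, batch_first=True)
head9 = nn.Linear(128, 9)   # 9-label head
head3 = nn.Linear(128, 3)   # 3-label head
ce = nn.CrossEntropyLoss()

x = torch.randn(4, 20, 300)        # (batch, seq_len, embedding_dim)
y9 = torch.randint(0, 9, (4,))     # 9-class targets
y3 = torch.randint(0, 3, (4,))     # 3-class targets

out, _ = gru(x)                    # out: (batch, seq_len, hidden_size)
h = out[:, -1, :]                  # hidden state of the last time step
loss = ce(head9(h), y9) + ce(head3(h), y3)   # joint loss
loss.backward()                    # one backward pass through both heads and the GRU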

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import numpy as np

path="/content/"
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    
import pickle

with open(path + "word2index.pickle",'rb') as out:
    word2index = pickle.load(out)


with open(path + "train_data.pickle",'rb') as out:
    train = pickle.load(out)

with open(path + "val_data.pickle",'rb') as out:
    val = pickle.load(out)

with open(path + "test_data.pickle",'rb') as out:
    test = pickle.load(out)

with open(path + "word2index.pickle",'rb') as out:
    word2index = pickle.load(out)

with open(path + "dic.pickle",'rb') as out:
    model=pickle.load(out)

embedding_matrix = np.zeros((len(word2index) + 1, 300))  # GloVe embedding matrix
for k, v in model.items():
    embedding_matrix[k] = v

weight = torch.FloatTensor(embedding_matrix)      # custom GloVe embeddings
embedding = nn.Embedding.from_pretrained(weight)  # embedding layer loaded with the GloVe weights

MAXLEN = 0
def sequence(word2index, mode):  # convert each example into word indices plus its two labels
  global MAXLEN  # track the longest sequence so we can pad to it later
  sequence = []
  y_seq = []
  y_seq2 = []
  for s in mode:
    text = s.split('__split__')[0]
    y = s.split('__split__')[1]
    y2 = s.split('__split__')[2]
    one_seq = []
    words = text.split(' ')
    if len(words) > MAXLEN:
      MAXLEN = len(words)

    for w in words:
      if w in word2index:
        one_seq.append(word2index[w])
      else:
        one_seq.append(1)  # index 1 is used for out-of-vocabulary words
    sequence.append(one_seq)
    y_seq.append(y)
    y_seq2.append(y2)
  return sequence, y_seq, y_seq2

sequence, y_seq, y_seq2 = sequence(word2index, train)

def labels_to_tensor(labels, label_vocab):
    # map each label string to its index in the sorted label vocabulary
    return torch.LongTensor([label_vocab.index(l) for l in labels])

label_seq = sorted(set(y_seq))    # sorted label vocabulary for the 9-label task
label_seq2 = sorted(set(y_seq2))  # sorted label vocabulary for the 3-label task
labels1 = labels_to_tensor(y_seq, label_seq).to(device)
labels2 = labels_to_tensor(y_seq2, label_seq2).to(device)

X = pad_sequences(sequence, maxlen=MAXLEN, padding='post', truncating='post')
inputs = torch.LongTensor(X)
embeded_train = embedding(inputs).to(device)  # embed the whole training set up front

batch_size = 32
dataset = TensorDataset(embeded_train, labels1,labels2)
loader = DataLoader(dataset, batch_size=batch_size)

num_classes1 = 9
num_classes2 = 3
num_epochs = 200

learning_rate = 0.001

input_size = embeded_train.shape[2]
sequence_length = embeded_train.shape[1]
hidden_size = 128
num_layers = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        #self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        #self.fc = nn.Linear(hidden_size, num_classes)
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, num_classes1)  # head for the 9-label task
        self.fc2 = nn.Linear(hidden_size, num_classes2)  # head for the 3-label task

    def forward(self, x):
        # h0 defaults to zeros, so it does not need to be passed explicitly
        out, _ = self.gru(x)   # out: (batch, seq_len, hidden_size)
        out = out[:, -1, :]    # hidden state of the last time step
        out1 = self.fc1(out)   # logits for the 9-label task
        out2 = self.fc2(out)   # logits for the 3-label task
        return out1, out2

model = RNN(input_size, hidden_size, num_layers).to(device)
criterion1 = nn.CrossEntropyLoss()
criterion2 = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

model = model.to(device)
criterion1 = criterion1.to(device)
criterion2 = criterion2.to(device)

for epoch in range(num_epochs):
  train_loss = 0.
  for batch_idx, (x, y, y2) in enumerate(loader):
    optimizer.zero_grad()
    output1, output2 = model(x)
    loss1 = criterion1(output1, y)   # loss for the 9-label head
    loss2 = criterion2(output2, y2)  # loss for the 3-label head
    loss = loss1 + loss2             # joint loss: sum of both heads
    loss.backward()
    optimizer.step()
    # running average of the batch losses within this epoch
    train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

  print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss))

Yeah! The implementation looks correct to me. A single optimizer over model.parameters() is all you need, since both heads belong to the same module; there is no need for two optimizers.
Also, since both of your outputs use an unweighted cross-entropy loss, you can even use a single nn.CrossEntropyLoss() instance as the criterion for both of them:

batch_size = 10
nb_classes = 4
criterion = nn.CrossEntropyLoss()
a = torch.randn(batch_size, nb_classes)   # logits from one head
b = torch.randn(batch_size, nb_classes)   # logits from another head
target = torch.empty(batch_size, dtype=torch.long).random_(nb_classes)
criterion(a, target)
# tensor(1.5086)
criterion(b, target)
# tensor(1.4570)
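
Mapped back onto your training loop, that would just mean reusing one criterion instance for both heads (a sketch that assumes the rest of your loop stays exactly as in your post):

criterion = nn.CrossEntropyLoss()  # one criterion shared by both heads

for x, y, y2 in loader:
    optimizer.zero_grad()
    output1, output2 = model(x)
    loss = criterion(output1, y) + criterion(output2, y2)  # summed joint loss
    loss.backward()
    optimizer.step()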