I am trying to add two dense layers on top of a single GRU to jointly classify a single text with both a 3-label and a 9-label output. I sum the losses from both heads.
Am I doing it right? Do I need two optimizers, or is there any further improvement that can be made?
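For context, here is the core pattern I am attempting, boiled down to a minimal self-contained sketch (sizes and names here are made up; my full code follows below):

import torch
import torch.nn as nn

# one shared GRU encoder, two task-specific linear heads, summed losses, a single optimizer
gru = nn.GRU(input_size=300, hidden_size=128, batch_first=True)
head9 = nn.Linear(128, 9)  # 9-label head
head3 = nn.Linear(128, 3)  # 3-label head
opt = torch.optim.Adam(list(gru.parameters()) + list(head9.parameters()) + list(head3.parameters()))
ce = nn.CrossEntropyLoss()

x = torch.randn(4, 10, 300)   # dummy batch: (batch, seq_len, emb_dim)
y9 = torch.randint(0, 9, (4,))
y3 = torch.randint(0, 3, (4,))
out, _ = gru(x)
h = out[:, -1, :]             # last time step as the sequence summary
loss = ce(head9(h), y9) + ce(head3(h), y3)
opt.zero_grad(); loss.backward(); opt.step()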
import pickle

import numpy as np
import torch
import torch.nn as nn
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
path="/content/"
is_cuda = torch.cuda.is_available()
# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
device = torch.device("cuda")
else:
device = torch.device("cpu")
with open(path + "word2index.pickle", 'rb') as f:
    word2index = pickle.load(f)
with open(path + "train_data.pickle", 'rb') as f:
    train = pickle.load(f)
with open(path + "val_data.pickle", 'rb') as f:
    val = pickle.load(f)
with open(path + "test_data.pickle", 'rb') as f:
    test = pickle.load(f)
with open(path + "dic.pickle", 'rb') as f:
    glove = pickle.load(f)  # word index -> 300-d GloVe vector
embedding_matrix = np.zeros((len(word2index) + 1, 300))  # GloVe embedding matrix
for k, v in glove.items():
    embedding_matrix[k] = v
weight = torch.FloatTensor(embedding_matrix)      # customized GloVe embeddings
embedding = nn.Embedding.from_pretrained(weight)  # load GloVe weights
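# Note: nn.Embedding.from_pretrained freezes the weights by default (freeze=True);
# pass freeze=False if the GloVe vectors should be fine-tuned during training.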
MAXLEN = 0

def make_sequences(word2index, data):  # returns word indices and both y labels per text
    global MAXLEN
    sequences = []
    y_seq = []
    y_seq2 = []
    for s in data:
        text = s.split('__split__')[0]
        y = s.split('__split__')[1]
        y2 = s.split('__split__')[2]
        words = text.split(' ')
        if len(words) > MAXLEN:
            MAXLEN = len(words)
        one_seq = [word2index[w] if w in word2index else 1 for w in words]  # 1 = OOV index
        sequences.append(one_seq)
        y_seq.append(y)
        y_seq2.append(y2)
    return sequences, y_seq, y_seq2
sequences, y_seq, y_seq2 = make_sequences(word2index, train)

def labels_to_tensor(labels, label_vocab):  # convert string labels to an index tensor
    return torch.LongTensor([label_vocab.index(l) for l in labels])

label_seq = sorted(set(y_seq))    # sorted label vocabulary for the 9-label task
labels1 = labels_to_tensor(y_seq, label_seq).to(device)
label_seq2 = sorted(set(y_seq2))  # sorted label vocabulary for the 3-label task
labels2 = labels_to_tensor(y_seq2, label_seq2).to(device)
X = pad_sequences(sequences, maxlen=MAXLEN, padding='post', truncating='post')
inputs = torch.LongTensor(X)
embeded_train = embedding(inputs).to(device)  # embed the whole training set up front
batch_size = 32
dataset = TensorDataset(embeded_train, labels1, labels2)
loader = DataLoader(dataset, batch_size=batch_size)
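# Each batch yields x: (batch_size, MAXLEN, 300), y: (batch_size,), y2: (batch_size,);
# note DataLoader defaults to shuffle=False, so batches come in dataset order.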
num_classes1 = 9
num_classes2 = 3
num_epochs = 200
learning_rate = 0.001
input_size = embeded_train.shape[2]       # 300 (embedding dim)
sequence_length = embeded_train.shape[1]  # MAXLEN
hidden_size = 128
num_layers = 5
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes1, num_classes2):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, num_classes1)  # 9-label head
        self.fc2 = nn.Linear(hidden_size, num_classes2)  # 3-label head

    def forward(self, x):
        # h0 defaults to zeros, so no explicit initial hidden state is needed
        out, _ = self.gru(x)
        out = out[:, -1, :]  # last time step of the top layer
        out1 = self.fc1(out)
        out2 = self.fc2(out)
        return out1, out2
model = RNN(input_size, hidden_size, num_layers, num_classes1, num_classes2).to(device)
criterion1 = nn.CrossEntropyLoss()
criterion2 = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
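# Quick shape check (illustrative, with a dummy batch):
# dummy = torch.randn(4, sequence_length, input_size).to(device)
# o1, o2 = model(dummy)  # o1: (4, num_classes1), o2: (4, num_classes2)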
for epoch in range(num_epochs):
    train_loss = 0.
    for batch_idx, (x, y, y2) in enumerate(loader):
        optimizer.zero_grad()
        output1, output2 = model(x)
        loss1 = criterion1(output1, y)
        loss2 = criterion2(output2, y2)
        loss = loss1 + loss2  # joint loss: sum of the two heads' losses
        loss.backward()
        optimizer.step()
        # running mean of the joint loss over the batches seen so far this epoch
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss))