Hi! I was trying to implement pointer generator networks for text summarization
: https://arxiv.org/abs/1704.04368
In order to implement this I built the network on top of the attention model which was provided in the PyTorch Tutorials . The dataset used was just first 30k pairs of this : https://www.kaggle.com/sunnysai12345/news-summary
Here is my Encoder model :
class Encoder(nn.Module):
def __init__(self,input_vocab_size,hidden_size):
super(Encoder,self).__init__()
self.hidden_size = hidden_size
self.input_vocab_size = input_vocab_size
self.embedding_layer = nn.Embedding(self.input_vocab_size,embedding_dim = self.hidden_size)
self.gru_layer = nn.GRU(input_size = self.hidden_size,hidden_size = self.hidden_size)
def forward(self,input,prev_hidden_state):
embedded_outputs = self.embedding_layer(input).view(1,1,-1) #sequence_length X batch_size X inputsize --> here input size is mostly the word embedding matrix or one hot
output,prev_hidden_state = self.gru_layer(embedded_outputs,prev_hidden_state) #output is batch_size times hidden_size
return output,prev_hidden_state
def init_hidden(self):
return torch.zeros(1,1,self.hidden_size,device=device)
And here is my Attention Decoder:
class AttentionDecoder(nn.Module):
def __init__(self,output_vocab_size,hidden_size,max_length_encoder,dropout_value,num_layers=1):
super(AttentionDecoder,self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.output_vocab_size = output_vocab_size
self.dropout_p = dropout_value
self.max_length_encoder = max_length_encoder
self.embedding_layer = nn.Embedding(self.output_vocab_size,self.hidden_size)
self.attention_layer = nn.Linear(self.hidden_size*2,self.max_length_encoder)
self.attention_combine = nn.Linear(self.hidden_size*2,self.hidden_size)
# self.Wsx = nn.Linear(self.hidden_size*2,1)
# self.Wcontext = nn.Linear(self.hidden_size,1)
self.Wsxe = nn.Linear(self.hidden_size*3,1)
self.gru_layer = nn.GRU(self.hidden_size,self.hidden_size)
self.output_layer = nn.Linear(self.hidden_size,self.output_vocab_size)
self.dropout_layer = nn.Dropout(self.dropout_p)
def forward(self,input_,prev_hidden_state,encoder_output):
embedding_outputs = self.embedding_layer(input_).view(1,1,-1)
embeddings_dropout = self.dropout_layer(embedding_outputs)
attention_layer_output = self.attention_layer(torch.cat((embeddings_dropout[0],prev_hidden_state[0]),1))
attention_weights = nn.functional.softmax(attention_layer_output,dim=1)
attention_applied = torch.bmm(attention_weights.unsqueeze(0),encoder_output.unsqueeze(0))
concatenate_vector = torch.cat((embeddings_dropout[0],prev_hidden_state[0],attention_applied[0]),1)
# concat_attention = torch.cat((),1)
concatenate_vector_outputs = self.Wsxe(concatenate_vector)
# context_vector_output = self.Wcontext(attention_applied)
pgen = nn.Sigmoid()
pgen = pgen(concatenate_vector_outputs)
attention_combine_logits = self.attention_combine(torch.cat((embeddings_dropout[0],attention_applied[0]),1)).unsqueeze(0) #since gru requires a batch dimension
attention_combine_relu = nn.functional.relu(attention_combine_logits)
output,hidden = self.gru_layer(attention_combine_relu,prev_hidden_state)
output_logits = self.output_layer(output)
output_softmax = nn.functional.log_softmax(output_logits[0],dim=1)
# print(output_softmax)
return output_softmax,hidden,attention_weights,pgen
def init_hidden(self):
return torch.zeros(1,1,self.hidden_size,device=device)
The part of pointing is done in training part down below :
teacher_forcing_ratio = 0.5
def train(encoder,decoder,encoder_tens,decoder_tens,criterion,encoder_max_length,encoder_optimizer,decoder_optimizer):
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
encoder_hidden_state = encoder.init_hidden()
input_length = len(encoder_tens)
output_length = len(decoder_tens)
encoder_outputs = torch.zeros(encoder_max_length,encoder.hidden_size,device=device)
loss = 0
for encoder_index in range(input_length):
encoder_output,encoder_hidden_state = encoder(encoder_tens[encoder_index],encoder_hidden_state)
encoder_outputs[encoder_index] = encoder_output[0,0]
decoder_input = torch.tensor([[word2Index_dec['<START>']]],device=device)
decoder_hidden_state = encoder_hidden_state
use_teacher_forcing = True if random.random() > teacher_forcing_ratio else False
encoder_indices = encoder_tens.tolist()
encoder_words = [ind2Word_enc[index[0]] for index in encoder_indices]
common_words = []
for word in encoder_words:
if word in list(word2Index_dec.keys()):
common_words.append(word)
extended_vocab_i2w = {index:word for word,index in word2Index_dec.items()}
last_index = len(extended_vocab_i2w.keys())
for i in range(len(encoder_words)):
if encoder_words[i] not in extended_vocab_i2w.values():
extended_vocab_i2w[last_index] = encoder_words[i]
last_index+=1
extended_vocab_w2i = {word:index for index,word in extended_vocab_i2w.items()}
extended_probab = [0]*len(extended_vocab_w2i.keys())
common_words_indices = [extended_vocab_w2i[word] for word in common_words]
if use_teacher_forcing:
for decoder_index in range(output_length):
decoder_output,decoder_hidden_state,attention_weights,pgen = decoder(decoder_input,decoder_hidden_state,encoder_outputs)
pgen = pgen.tolist()
decoder_output_list = decoder_output.tolist()
attention_mapping = {}
attention_weights_list = attention_weights.tolist()[0]
for i in range(len(encoder_indices)):
encoder_word = encoder_words[i]
extended_vocabulary_index = extended_vocab_w2i[encoder_word]
attention_mapping[extended_vocabulary_index] = attention_weights_list[i]
pgen = pgen[0][0]
for i in range(len(extended_probab)):
if i < len(word2Index_dec.keys()):
if i in common_words_indices:
extended_probab[i] = pgen*np.exp(np.array(decoder_output_list[0][i])) + (1-pgen)*attention_mapping[i]
else:
extended_probab[i] = pgen*np.exp(np.array(decoder_output_list[0][i]))
else:
extended_probab[i] = (1-pgen)*attention_mapping[i]
extended_outputs = torch.tensor(extended_probab,device=device)
loss+=criterion(decoder_output,decoder_tens[decoder_index])
decoder_input = decoder_tens[decoder_index]
else:
for decoder_index in range(output_length):
decoder_output,decoder_hidden_state,attention_weights,pgen = decoder(decoder_input,decoder_hidden_state,encoder_outputs)
pgen = pgen.tolist()
decoder_output_list = decoder_output.tolist()
attention_mapping = {}
attention_weights_list = attention_weights.tolist()[0]
for i in range(len(encoder_indices)):
encoder_word = encoder_words[i]
extended_vocabulary_index = extended_vocab_w2i[encoder_word]
attention_mapping[extended_vocabulary_index] = attention_weights_list[i]
pgen = pgen[0][0]
for i in range(len(extended_probab)):
if i < len(word2Index_dec.keys()):
if i in common_words_indices:
extended_probab[i] = pgen*np.exp(np.array(decoder_output_list[0][i])) + (1-pgen)*attention_mapping[i]
else:
extended_probab[i] = pgen*np.exp(np.array(decoder_output_list[0][i]))
else:
extended_probab[i] = (1-pgen)*attention_mapping[i]
extended_probab[0] = 0
extended_outputs = torch.tensor(extended_probab,device=device)
loss+=criterion(decoder_output,decoder_tens[decoder_index])
topv,topi = extended_outputs.topk(1)
decoder_input = topi.squeeze().detach()
if decoder_input >= len(word2Index_dec.keys()):
decoder_input = torch.tensor(0,dtype=torch.long,device=device)
if decoder_input.item() == word2Index_dec['<END>']:
break
loss.backward()
encoder_optimizer.step()
decoder_optimizer.step()
return loss.item()/output_length
My vocabulary size is 21k for decoder and 47k for encoder.
When I train this model , my learning rate is 0.03 and the optimizer is Adam . It takes large amount of time to train . It starts with a loss of about 7.5 and and took me 7 hours on Google Colab GPU to reduce it to 5.5 . Can someone please help me out?