I am working on a text classification task, and I am only interested in the output at the last time step of each sequence. Since I would prefer to process the data in batches, I came up with the following code (after looking into a few references online):
class Encoder(nn.Module):
    """GRU encoder that classifies a padded batch from its last valid time step.

    Pipeline: token ids -> embedding -> linear projection -> (multi-layer) GRU
    -> linear output layer -> sigmoid. Only the GRU state at the final
    *non-padded* step of every sequence is fed to the output layer.

    Args:
        input_size: Vocabulary size of the embedding table.
        encoding_size: Dimensionality of the token embeddings.
        hidden_size: Size of the projected inputs and of the GRU hidden state.
        output_size: Number of output units (one sigmoid score per unit).
        layers: Number of stacked GRU layers.
        padding_idx: Token id used for padding; its embedding stays zero.
    """

    def __init__(self, input_size, encoding_size, hidden_size, output_size,
                 layers, padding_idx):
        super().__init__()
        self.hidden_size = hidden_size
        self.encoding_size = encoding_size
        self.layers = layers
        # The batch size is not a constructor argument (reading it here used
        # to raise NameError); it is recorded on every forward() call instead.
        self.batch_size = None
        self.embedding = nn.Embedding(input_size, encoding_size,
                                      padding_idx=padding_idx)
        self.e2i = nn.Linear(encoding_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True,
                          num_layers=self.layers)
        self.out = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
        self.batch_first = True

    def forward(self, X, X_lengths, batch_size=None):
        """Return sigmoid scores computed from each sequence's last real step.

        Args:
            X: Integer tensor of padded token ids; batch-major when
                ``self.batch_first`` is true.
            X_lengths: True (unpadded) length of every sequence in ``X``,
                as a list or tensor. Lengths may be in any order.
            batch_size: Number of sequences in ``X``. Inferred from ``X``
                when omitted.

        Returns:
            Tensor of shape ``(batch_size, output_size)`` with values in (0, 1).
        """
        if batch_size is None:
            batch_size = X.size(0 if self.batch_first else 1)
        self.batch_size = batch_size
        self.hidden = self.initHidden(batch_size)

        projected = self.e2i(self.embedding(X))

        # pack_padded_sequence needs the lengths on the CPU regardless of
        # where the data lives.
        lengths = torch.as_tensor(X_lengths, dtype=torch.int64).cpu()
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            projected,
            lengths,
            batch_first=self.batch_first,
            enforce_sorted=False,  # accept batches not sorted by length
        )
        _, self.hidden = self.gru(packed, self.hidden)

        # For packed input the GRU stops each sequence at its own length, so
        # the top layer's final hidden state *is* the output at the last valid
        # time step. This replaces the unpack + gather on a CUDA-only index.
        last_step = self.hidden[-1]
        return self.sigmoid(self.out(last_step))

    def initHidden(self, batch_size):
        """Return a zero initial state of shape (layers, batch_size, hidden_size).

        The tensor is created on the same device and with the same dtype as
        the model parameters, so no module-level ``device`` global is needed.
        """
        reference = self.embedding.weight
        return torch.zeros(self.layers, batch_size, self.hidden_size,
                           device=reference.device, dtype=reference.dtype)
However, I observed that training was very slow. Since my text sequences are long (in terms of the number of words), I have decided to use truncated backpropagation through time (TBPTT). Could anyone suggest how I can modify the above code to do this?