I am working on a text classification task and I am only interested in the last output. Since I would prefer to execute in batches, I came up with the following code (after looking into a few references online)
class Encoder(nn.Module):
def __init__(self, input_size, encoding_size, hidden_size, output_size, layers, padding_idx):
super(Encoder, self).__init__()
self.hidden_size = hidden_size
self.encoding_size = encoding_size
self.layers = layers
self.batch_size = batch_size
self.embedding = nn.Embedding(input_size, encoding_size, padding_idx=padding_idx)
self.e2i = nn.Linear(encoding_size, hidden_size)
self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, num_layers=self.layers)
self.out = nn.Linear(hidden_size, output_size)
self.sigmoid = nn.Sigmoid()
self.batch_first = True
def forward(self, X, X_lengths, batch_size):
self.hidden = self.initHidden(batch_size)
X = self.embedding(X)
X = self.e2i(X)
X = rnn.pack_padded_sequence(X, X_lengths, batch_first=True)
X, self.hidden = self.gru(X, self.hidden)
X, _ = torch.nn.utils.rnn.pad_packed_sequence(X, batch_first=True)
idx = (torch.cuda.LongTensor(X_lengths) - 1).view(-1, 1).expand(len(X_lengths), X.size(2))
time_dimension = 1 if self.batch_first else 0
idx = idx.unsqueeze(time_dimension)
X = X.gather(time_dimension, Variable(idx)).squeeze(time_dimension)
X = self.out(X)
X = self.sigmoid(X)
return X
def initHidden(self,batch_size):
return torch.zeros(self.layers, batch_size, self.hidden_size).to(device)
However, I observed that the training was very slow. Since my text sequences are long (in terms of number of words) I have decided to perform a truncated backpropagation through time. Could anyone suggest how I can modify the above code to serve my purpose?