Hello,
I am implementing a text classifier. I need to customize the LSTM cell, so I was comparing nn.LSTM against nn.LSTMCell (runtime performance is not a factor in this case).
Here is my code for the LSTMCell implementation:
import torch
import torch.nn as nn
from torch.autograd import Variable


class RNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_output, hidden_size=64,
                 num_layers=2, batch_first=True, use_gpu=False, embeddings=None):
        '''
        :param vocab_size: vocab size
        :param embed_size: embedding size
        :param num_output: number of output (classes)
        :param hidden_size: hidden size of rnn module
        :param num_layers: number of layers in rnn module
        :param batch_first: batch first option
        '''
        super(RNN, self).__init__()

        # embedding, optionally initialised from pre-trained vectors
        self.embedding_dim = embed_size
        self.encoder = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        if embeddings is not None:
            self.encoder.weight = nn.Parameter(embeddings)

        self.batch_first = batch_first
        self.drop_en = nn.Dropout(p=0.8)
        self.use_gpu = use_gpu
        self.hidden_dim = hidden_size

        # built-in multi-layer LSTM (the baseline I am comparing against)
        self.rnn = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=0.0,
            batch_first=True,
            bidirectional=False
        )
        # single LSTM cell, stepped manually over the sequence in forward()
        self.rnncell = nn.LSTMCell(
            input_size=embed_size,
            hidden_size=hidden_size
        )

        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.fc = nn.Linear(hidden_size, num_output)
        # hidden state created once, with a fixed batch size of 128
        self.hidden = self.init_hidden(128)

    def init_hidden(self, batch_size):
        if self.use_gpu:
            hx = Variable(torch.zeros(batch_size, self.hidden_dim), requires_grad=False).cuda()
            cx = Variable(torch.zeros(batch_size, self.hidden_dim), requires_grad=False).cuda()
        else:
            hx = Variable(torch.zeros(batch_size, self.hidden_dim))
            cx = Variable(torch.zeros(batch_size, self.hidden_dim))
        return hx, cx

    def forward(self, x, seq_lengths):
        '''
        :param x: (batch, time_step, input_size)
        :return: num_output size
        '''
        hx, cx = self.hidden

        x_embed = self.encoder(x)
        x_embed = self.drop_en(x_embed)
        # move the time dimension to the front: (time_step, batch, embed_size)
        x_embed = x_embed.transpose(0, 1)
        #packed_input = pack_padded_sequence(x_embed, seq_lengths.cpu().numpy(), batch_first=self.batch_first)

        # step the LSTMCell over the sequence and collect the hidden states
        yhat = []
        for j in range(x_embed.size(0)):
            input_t = x_embed[j]  # (batch, embed_size)
            hx, cx = self.rnncell(input_t, (hx, cx))
            yhat.append(hx)

        # the nn.LSTM path this is being compared against:
        #ht, ct = self.rnn(x_embed, self.hidden)
        #out_rnn, _ = pad_packed_sequence(packed_output, batch_first=True)
        #row_indices = torch.arange(0, x.size(0)).long()
        #col_indices = seq_lengths - 1
        #if next(self.parameters()).is_cuda:
        #    row_indices = row_indices.cuda()
        #    col_indices = col_indices.cuda()

        # use the hidden state of the last time step as the sentence representation
        last_tensor = yhat[-1]
        #last_tensor = ht[-1]
        #fc_input = torch.mean(last_tensor, dim=1)
        fc_input = self.bn2(last_tensor)
        out = self.fc(fc_input)
        return out
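For context, this is roughly how I build and call the model (a minimal sketch with made-up sizes; the batch size is fixed at 128 to match init_hidden(128), and this assumes a recent PyTorch where plain tensors work without Variable):

import torch

# hypothetical sizes, just to illustrate the call
vocab_size, embed_size, num_classes = 10000, 100, 5
batch_size, max_len = 128, 40  # batch size must match init_hidden(128)

model = RNN(vocab_size, embed_size, num_classes)
model.train()

x = torch.randint(1, vocab_size, (batch_size, max_len))             # padded token ids, (batch, time_step)
seq_lengths = torch.full((batch_size,), max_len, dtype=torch.long)  # not used inside forward() yet

logits = model(x, seq_lengths)  # (batch, num_classes)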
The difference in performance between the two is huge; am I doing something wrong here?
By performance I mean classification accuracy, not speed.
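For reference, my understanding is that a single-layer nn.LSTM and a manually stepped nn.LSTMCell should produce identical outputs once they share the same weights. Here is the standalone sanity check I have in mind (a minimal sketch with made-up sizes, single layer, no dropout, recent PyTorch):

import torch
import torch.nn as nn

torch.manual_seed(0)
time_steps, batch, input_size, hidden_size = 5, 4, 8, 16

lstm = nn.LSTM(input_size, hidden_size, num_layers=1)  # expects (time, batch, input)
cell = nn.LSTMCell(input_size, hidden_size)

# copy the LSTM's single layer of weights into the cell
cell.weight_ih.data.copy_(lstm.weight_ih_l0.data)
cell.weight_hh.data.copy_(lstm.weight_hh_l0.data)
cell.bias_ih.data.copy_(lstm.bias_ih_l0.data)
cell.bias_hh.data.copy_(lstm.bias_hh_l0.data)

x = torch.randn(time_steps, batch, input_size)
h0 = torch.zeros(1, batch, hidden_size)
c0 = torch.zeros(1, batch, hidden_size)

# whole sequence in one call
out_lstm, _ = lstm(x, (h0, c0))

# manual unroll, one step at a time
hx, cx = h0[0], c0[0]
outputs = []
for t in range(time_steps):
    hx, cx = cell(x[t], (hx, cx))
    outputs.append(hx)
out_cell = torch.stack(outputs)

print(torch.allclose(out_lstm, out_cell, atol=1e-6))  # expect True

If that check passes, then whatever accuracy gap I see would have to come from the parts of my model that differ between the two paths (the two stacked layers in self.rnn versus the single cell, and the way self.hidden is created once and reused across batches).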
Thanks in advance.