Hi,
I would like to do image captioning using a customized LSTM.
My problem is that most LSTM models use the pack_padded_sequence() function with padding, and train with a batch size, when processing sentences of different lengths in a batch.
I was wondering how to feed the same inputs into my customized LSTM without using pack_padded_sequence().
This is my model:
class FactoredLSTM(nn.Module):
    """Factored LSTM decoder for image captioning (StyleNet-style).

    Each gate's input-to-hidden transform is factored into three linear
    maps U * S * V; the S factor is style-specific and is applied only
    when ``mode == "factual"``.
    """

    def __init__(self, emb_dim, hidden_dim, factored_dim, vocab_size, max_len=30):
        """Set the hyper-parameters and build the layers.

        Args:
            emb_dim: word/image embedding size.
            hidden_dim: LSTM hidden-state size.
            factored_dim: size of the factored bottleneck.
            vocab_size: number of words in the vocabulary.
            max_len: maximum caption length (stored; used at inference).
        """
        super(FactoredLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.factored_dim = factored_dim
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.linear = nn.Linear(hidden_dim, vocab_size)
        # Factored input-to-hidden weights (U, V) and hidden-to-hidden
        # weights (W) for the input (i), forget (f), output (o) and
        # candidate-cell (c) gates.
        self.U_i = nn.Linear(factored_dim, hidden_dim)
        self.V_i = nn.Linear(emb_dim, factored_dim)
        self.W_i = nn.Linear(hidden_dim, hidden_dim)
        self.U_f = nn.Linear(factored_dim, hidden_dim)
        self.V_f = nn.Linear(emb_dim, factored_dim)
        self.W_f = nn.Linear(hidden_dim, hidden_dim)
        self.U_o = nn.Linear(factored_dim, hidden_dim)
        self.V_o = nn.Linear(emb_dim, factored_dim)
        self.W_o = nn.Linear(hidden_dim, hidden_dim)
        self.U_c = nn.Linear(factored_dim, hidden_dim)
        self.V_c = nn.Linear(emb_dim, factored_dim)
        self.W_c = nn.Linear(hidden_dim, hidden_dim)
        # Style-specific factor matrices S, one per gate.
        self.S_i = nn.Linear(factored_dim, factored_dim)
        self.S_f = nn.Linear(factored_dim, factored_dim)
        self.S_o = nn.Linear(factored_dim, factored_dim)
        self.S_c = nn.Linear(factored_dim, factored_dim)

    def forward_factor(self, embedded, h_0, c_0, mode):
        """Run one step of the factored LSTM cell.

        Args:
            embedded: input embeddings for this time step, [batch, emb_dim].
            h_0: previous hidden state, [batch, hidden_dim].
            c_0: previous cell state, [batch, hidden_dim].
            mode: "factual" routes the input through the S factor matrices.

        Returns:
            (output, h_t, c_t) — output equals the new hidden state h_t.
        """
        i = self.V_i(embedded)
        f = self.V_f(embedded)
        o = self.V_o(embedded)
        c = self.V_c(embedded)
        if mode == "factual":
            i = self.S_i(i)
            f = self.S_f(f)
            o = self.S_o(o)
            c = self.S_c(c)
        i_t = torch.sigmoid(self.U_i(i) + self.W_i(h_0))
        f_t = torch.sigmoid(self.U_f(f) + self.W_f(h_0))
        o_t = torch.sigmoid(self.U_o(o) + self.W_o(h_0))
        c_tilda = torch.tanh(self.U_c(c) + self.W_c(h_0))
        c_t = f_t * c_0 + i_t * c_tilda
        # FIX: the standard LSTM output is o_t * tanh(c_t); the original
        # omitted the tanh, which lets h_t grow unbounded.
        h_t = o_t * torch.tanh(c_t)
        return h_t, h_t, c_t

    def forward(self, features, captions, lengths, mode="factual"):
        """Teacher-forced decoding over a batch of captions.

        Args:
            features: fixed vectors from images, [batch, emb_dim].
            captions: padded word indices, [batch, seq_len].
            lengths: true caption lengths, one per batch item.
            mode: type of caption to generate.

        Returns:
            Vocabulary logits, [batch, seq_len, vocab_size].

        Note:
            A custom cell does not need pack_padded_sequence(): just unroll
            it step by step over the padded batch. Padded positions still
            produce outputs — mask them in the loss (e.g. by packing the
            targets) rather than here.
        """
        embedded = self.embed(captions)
        # Prepend the image feature as the first time step.
        embedded = torch.cat((features.unsqueeze(1), embedded), 1)
        # Initialize hidden state (one state row per batch item).
        h_t, c_t = self.init_hidden_states(len(lengths))
        outputs = []
        # Unroll the cell across the (image + caption) sequence; the last
        # input is not fed because there is no target after it.
        for index in range(embedded.size(1) - 1):
            emb = embedded[:, index, :]
            hiddens, h_t, c_t = self.forward_factor(emb, h_t, c_t, mode=mode)
            outputs.append(self.linear(hiddens))
        return torch.stack(outputs, 1)

    def init_hidden_states(self, batch_size):
        """Return fresh (h0, c0) state tensors on the module's device.

        Replaces the deprecated Variable wrapper and the hard-coded
        .cuda() call, so the model also runs on CPU; on a CUDA-placed
        model the behavior is unchanged.
        """
        device = next(self.parameters()).device
        h0 = torch.empty(batch_size, self.hidden_dim, device=device)
        c0 = torch.empty(batch_size, self.hidden_dim, device=device)
        # NOTE(review): states are randomly re-initialized on every call,
        # as in the original; zeros are the more common choice — confirm
        # this is intentional.
        nn.init.uniform_(h0)
        nn.init.uniform_(c0)
        return h0, c0
I am confused about this part — whether I am feeding the correct parameters into the customized LSTM.
As follow:
# iterate for length of captions
for index in range(embedded.size(1)-1):
emb = embedded[:, index, :]
hiddens, h_t, c_t = self. forward_factor(emb, h_t, c_t, mode=mode)
outs = self.linear(hiddens)
outputs.append(outs)
Can anyone give me some advice?
Thanks a lot!!!