Hi,
I have trained an encoder-decoder model for image captioning and need to test it on input test data. Suppose the test data is a feature vector of size 512 (a torch tensor), and hidden_size is the same.
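For concreteness, the decoder input I am testing with looks roughly like this (dummy values, shown only to illustrate the expected shape):

import torch
features = torch.randn(1, 512)   # (batch_size, embed_size) feature vector from the encoder
# embed_size and hidden_size are both 512 in my setup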
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence


class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length):
        """Set the hyper-parameters and build the layers."""
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seq_length = max_seq_length

    def forward(self, features, captions, lengths):
        """Decode image feature vectors and generate captions."""
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        outputs = self.linear(hiddens[0])
        return outputs

    def sample(self, features, states=None):
        """Generate captions for given image features using greedy search."""
        sampled_ids = []
        inputs = features.unsqueeze(1)                   # inputs: (batch_size, 1, embed_size)
        for i in range(self.max_seq_length):
            hiddens, states = self.lstm(inputs, states)  # hiddens: (batch_size, 1, hidden_size)
            outputs = self.linear(hiddens.squeeze(1))    # outputs: (batch_size, vocab_size)
            _, predicted = outputs.max(1)                # predicted: (batch_size)
            sampled_ids.append(predicted)
            inputs = self.embed(predicted)               # inputs: (batch_size, embed_size)
            inputs = inputs.unsqueeze(1)                 # inputs: (batch_size, 1, embed_size)
        sampled_ids = torch.stack(sampled_ids, 1)        # sampled_ids: (batch_size, max_seq_length)
        return sampled_ids
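For reference, the forward pass above is meant to be trained against targets that are packed the same way as the embeddings; roughly like this (the criterion and variable names here are only illustrative, not my exact training code):

criterion = nn.CrossEntropyLoss()
# captions: (batch_size, max_len) padded word ids; lengths: true caption lengths, sorted descending
targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
outputs = decoder(features, captions, lengths)   # packed logits, aligned with the packed targets
loss = criterion(outputs, targets)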
So here is my input feature for the decoder. Please let me know any suggestions for my approach, because I'm getting almost the same output sentence for every input:
decoder = DecoderRNN(embed_size, hidden_size, len(voc), num_layers, max_seq_length=20).to(device)
decoder.load_state_dict(torch.load(decoder_path))
decoder.eval()

# image is the 512-dim feature from the encoder, reshaped to (batch_size, embed_size)
img_tensor = image.view(1, 512).to(device)
with torch.no_grad():
    sampled_ids = decoder.sample(img_tensor)
sampled_ids = sampled_ids[0].cpu().numpy()   # word indices for the single test image
Is this the correct way to generate the LSTM output from the DecoderRNN?
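For completeness, this is roughly how the sampled ids get mapped back to a sentence afterwards (assuming the vocabulary object voc has an idx2word lookup and <start>/<end> tokens; adapt the names to your own vocabulary class):

# sampled_ids: numpy array of word indices returned by decoder.sample()
words = []
for word_id in sampled_ids:
    word = voc.idx2word[word_id]   # idx2word is an assumed mapping; use your vocabulary's own lookup
    if word == '<end>':
        break
    if word != '<start>':
        words.append(word)
sentence = ' '.join(words)
print(sentence)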