I have implemented the same model in two ways. I'm applying an MLP to a set of word-object pairs, where the word is the same for the whole set but the objects differ. I first implemented this using a for loop:
import torch
import torch.nn as nn

class Listener(nn.Module):
    def __init__(self, object_size, vocab_size, wordemb_size, att_hidden_size, nonlinearity="sigmoid"):
        # inherit from torch.nn.Module
        super(Listener, self).__init__()
        self.word_embedder = nn.Embedding(vocab_size, wordemb_size)
        # Producer of attention over visual inputs:
        # MLP that takes the concatenation of the word embedding and an object, and has 1 output
        self.att_hidden = nn.Linear((wordemb_size + object_size), att_hidden_size)
        self.attention = nn.Linear(att_hidden_size, 1)
        # define nonlinearity function
        if nonlinearity == "sigmoid":
            self.nonlin = nn.functional.sigmoid
        elif nonlinearity == "relu":
            self.nonlin = nn.functional.relu

        ##########################################################################################
        # Initialization
        ##########################################################################################
        print("Initializing word embeddings")
        nn.init.xavier_uniform_(self.word_embedder.weight, gain=1)
        print("Initializing attention MLP weights...")
        nn.init.kaiming_normal_(self.att_hidden.weight, mode='fan_in', nonlinearity=nonlinearity)
        nn.init.kaiming_normal_(self.attention.weight, mode='fan_in', nonlinearity=nonlinearity)
        print("Initializing bias terms to all 0...")
        for name, param in self.named_parameters():
            if 'bias' in name:
                print(name)
                nn.init.constant_(param, 0.0)

    def forward(self, language_input, objects):
        batchsize = language_input.shape[0]
        n_objects = objects.shape[1]
        # get language representation
        embeds = self.word_embedder(language_input)
        predictions = []
        # loop over object-word pairs
        for ix in range(n_objects):
            concat = torch.cat((embeds, objects[:, ix, :]), dim=1)
            att_hid = self.nonlin(self.att_hidden(concat))
            predictions.append(self.nonlin(self.attention(att_hid)))
        preds = torch.cat(predictions, dim=1)
        return preds
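For context, the inputs I pass in look roughly like this (the sizes here are just illustrative, not my actual hyperparameters):

# Illustrative shapes only: a batch of 4 single-word inputs and 3 candidate objects each.
# language_input holds word indices; objects holds the object feature vectors.
model = Listener(object_size=50, vocab_size=100, wordemb_size=32, att_hidden_size=64)

language_input = torch.randint(0, 100, (4,), dtype=torch.long)  # (batchsize,)
objects = torch.randn(4, 3, 50)                                 # (batchsize, n_objects, object_size)

preds = model(language_input, objects)
print(preds.shape)  # torch.Size([4, 3]): one attention score per word-object pair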
But then I realized it could be sped up by collapsing the batch and object dimensions and feeding everything to the MLP in one go:
class Listener(nn.Module):
    def __init__(self, object_size, vocab_size, wordemb_size, att_hidden_size, nonlinearity="sigmoid"):
        # inherit from torch.nn.Module
        super(Listener, self).__init__()
        self.word_embedder = nn.Embedding(vocab_size, wordemb_size)
        self.size_embed = wordemb_size
        self.att_hidden = nn.Linear((wordemb_size + object_size), att_hidden_size)
        self.attention = nn.Linear(att_hidden_size, 1)
        # define nonlinearity function
        if nonlinearity == "sigmoid":
            self.nonlin = nn.functional.sigmoid
        elif nonlinearity == "relu":
            self.nonlin = nn.functional.relu

        ##########################################################################################
        # Initialization
        ##########################################################################################
        print("Initializing word embeddings")
        nn.init.xavier_uniform_(self.word_embedder.weight, gain=1)
        print("Initializing attention MLP weights...")
        nn.init.kaiming_normal_(self.att_hidden.weight, mode='fan_in', nonlinearity=nonlinearity)
        nn.init.kaiming_normal_(self.attention.weight, mode='fan_in', nonlinearity=nonlinearity)
        print("Initializing bias terms to all 0...")
        for name, param in self.named_parameters():
            if 'bias' in name:
                print(name)
                nn.init.constant_(param, 0.0)

    def forward(self, language_input, objects):
        batchsize = language_input.shape[0]
        n_objects = objects.shape[1]
        embeds = self.word_embedder(language_input)
        # repeat the language embedding n_objects times, then fold into the batch dimension
        words = embeds.repeat(1, n_objects)
        words = words.reshape((n_objects * batchsize), self.size_embed)
        # collapse object & batch dimensions so everything can go through the MLP in one pass
        objects = objects.reshape(((batchsize * n_objects), objects.shape[2]))
        # concatenate words and objects
        concat = torch.cat((words, objects), dim=1)
        # return attention vector
        att_hid = self.nonlin(self.att_hidden(concat))
        attended = self.nonlin(self.attention(att_hid))
        attended = attended.reshape((batchsize, n_objects, 1)).squeeze(2)
        return attended
I'm running the two versions now with exactly the same hyperparameter settings and seed, and yet the results are slightly different (loss and accuracy differ by around 0.02 at most). Did I do something wrong, or is there a reason this is expected?
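For what it's worth, this is roughly how I'd compare the two on a single forward pass (ListenerLoop and ListenerBatched are just placeholder names for the two classes above, and the sizes are illustrative):

# Sanity check: same seed at construction, same inputs, compare the outputs.
torch.manual_seed(0)
loop_model = ListenerLoop(object_size=50, vocab_size=100, wordemb_size=32, att_hidden_size=64)
torch.manual_seed(0)
batched_model = ListenerBatched(object_size=50, vocab_size=100, wordemb_size=32, att_hidden_size=64)

language_input = torch.randint(0, 100, (4,), dtype=torch.long)
objects = torch.randn(4, 3, 50)

out_loop = loop_model(language_input, objects)
out_batched = batched_model(language_input, objects)

# If the two implementations are equivalent, any difference should be at
# floating-point rounding level (batched matmuls can round differently
# than per-object matmuls).
print((out_loop - out_batched).abs().max())
print(torch.allclose(out_loop, out_batched))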
Edit: I'm using PyTorch 0.4.0.