Using a for loop vs. a batched forward pass yields slightly different results

I have implemented the 'same' model in two ways. I’m applying an MLP to a set of word-object pairs, where the word is the same across the set but the objects differ. I first implemented this with a for loop:

import torch
import torch.nn as nn

class Listener(nn.Module):
    def __init__(self, object_size, vocab_size, wordemb_size, att_hidden_size, nonlinearity = "sigmoid"):

        # inherit from torch.nn.Module
        super(Listener, self).__init__() 

        self.word_embedder = nn.Embedding(vocab_size, wordemb_size)

        # Producer of attention over visual inputs.
        # MLP with concatenation of lstm output and object as input and 1 output
        self.att_hidden = nn.Linear((wordemb_size + object_size), att_hidden_size)
        self.attention = nn.Linear(att_hidden_size, 1)

        # define nonlinearity function
        if nonlinearity == "sigmoid":
            self.nonlin = nn.functional.sigmoid
        elif nonlinearity == "relu":
            self.nonlin = nn.functional.relu

        ##########################################################################################
        # Initialization
        ##########################################################################################
        print("Initializing word embeddings")
        nn.init.xavier_uniform_(self.word_embedder.weight, gain = 1)

        print("Initializing attention MLP weights...")
        nn.init.kaiming_normal_(self.att_hidden.weight, mode='fan_in', nonlinearity=nonlinearity)
        nn.init.kaiming_normal_(self.attention.weight, mode='fan_in', nonlinearity=nonlinearity)

        print("Initializing bias terms to all 0...")
        for name, param in self.named_parameters():
            if 'bias' in name:
                print(name)
                nn.init.constant_(param, 0.0)


    def forward(self, language_input, objects):
        batchsize = language_input.shape[0]
        n_objects = objects.shape[1]
        # get language representation
        embeds = self.word_embedder(language_input)
        predictions = []
        # loop over object-word pairs
        for ix in range(n_objects):
            concat = torch.cat((embeds, objects[:,ix,:]), dim = 1)
            att_hid = self.nonlin(self.att_hidden(concat))
            predictions.append(self.nonlin(self.attention(att_hid)))
        preds = torch.cat(predictions, dim=1)
        return preds
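
For context, I call it roughly like this (made-up sizes; language_input is a LongTensor with one word index per batch item, and objects is batchsize x n_objects x object_size):

import torch

# hypothetical sizes, just to show the expected input shapes
listener = Listener(object_size=64, vocab_size=100, wordemb_size=50, att_hidden_size=128)
language_input = torch.randint(0, 100, (32,), dtype=torch.long)  # one word index per batch item
objects = torch.randn(32, 5, 64)                                 # 5 candidate objects per word
preds = listener(language_input, objects)                        # shape (32, 5)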

But then I realized it could be sped up by collapsing the batch and object dimensions and feeding everything through the MLP in one go:

class Listener(nn.Module):
    def __init__(self, object_size, vocab_size, wordemb_size, att_hidden_size, nonlinearity = "sigmoid"):

        # inherit from torch.nn.Module
        super(Listener, self).__init__() 

        self.word_embedder = nn.Embedding(vocab_size, wordemb_size)
        self.size_embed = wordemb_size
        
        self.att_hidden = nn.Linear((wordemb_size + object_size), att_hidden_size)
        self.attention = nn.Linear(att_hidden_size, 1)

        # define nonlinearity function
        if nonlinearity == "sigmoid":
            self.nonlin = nn.functional.sigmoid
        elif nonlinearity == "relu":
            self.nonlin = nn.functional.relu

        ##########################################################################################
        # Initialization
        ##########################################################################################
        print("Initializing word embeddings")
        nn.init.xavier_uniform_(self.word_embedder.weight, gain = 1)

        print("Initializing attention MLP weights...")
        nn.init.kaiming_normal_(self.att_hidden.weight, mode='fan_in', nonlinearity=nonlinearity)
        nn.init.kaiming_normal_(self.attention.weight, mode='fan_in', nonlinearity=nonlinearity)

        print("Initializing bias terms to all 0...")
        for name, param in self.named_parameters():
            if 'bias' in name:
                print(name)
                nn.init.constant_(param, 0.0)


    def forward(self, language_input, objects):
        batchsize = language_input.shape[0]
        n_objects = objects.shape[1]
        embeds = self.word_embedder(language_input)
        # repeat each word embedding n_objects times so its rows line up with the flattened objects
        words = embeds.repeat(1, n_objects)
        words = words.reshape(n_objects * batchsize, self.size_embed)
        # collapse object & batch dimension so that it can go through MLP
        objects = objects.reshape(batchsize * n_objects, objects.shape[2])
        # concatenate words and objects
        concat = torch.cat((words, objects), dim = 1)

        # return attention vector
        att_hid = self.nonlin(self.att_hidden(concat))
        attended = self.nonlin(self.attention(att_hid))
        attended = attended.reshape(batchsize, n_objects)
        return attended
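
The row ordering of the repeat/reshape was the part I was least sure about, so here is a toy check (tiny made-up sizes) showing that the repeated word embeddings end up grouped per batch item, which matches the order produced by the objects reshape above:

import torch

# toy check of the repeat/reshape row ordering (hypothetical sizes)
batchsize, n_objects, emb_size = 2, 3, 4
embeds = torch.arange(batchsize * emb_size, dtype=torch.float).reshape(batchsize, emb_size)
words = embeds.repeat(1, n_objects).reshape(batchsize * n_objects, emb_size)
print(words)  # rows come out as b0, b0, b0, b1, b1, b1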

I’m running the two versions with exactly the same hyperparameter settings and seed, and yet the results are slightly different (loss and accuracy differ by about 0.02 at most). Did I do something wrong, or is there a reason this is expected?
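
For what it's worth, this is the kind of isolated check I have in mind to see whether a looped and a batched application of the same Linear layer agree exactly (toy shapes, not my real model; whether any difference shows up may depend on hardware and backend):

import torch
import torch.nn as nn

torch.manual_seed(0)
linear = nn.Linear(8, 1)
x = torch.randn(4, 3, 8)  # batchsize x n_objects x features

# apply the layer in a loop over the object dimension
looped = torch.cat([linear(x[:, i, :]) for i in range(3)], dim=1)
# apply it once to the flattened batch*object dimension
batched = linear(x.reshape(4 * 3, 8)).reshape(4, 3)

print(torch.allclose(looped, batched))
print((looped - batched).abs().max())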

Edit: using PyTorch 0.4.0.