Not able to replicate Theano model code

Hi

I am trying out PyTorch by implementing a model that was originally written in Theano.
Here is the original model:


Here is my PyTorch implementation:

I am not able to reproduce the results, and I think this has to do with the following:

  • For the custom recurrence over time, the Theano code uses the scan function while I am using a for loop, and maybe for some reason backpropagation does not happen properly through it, although I do see the loss decreasing steadily towards convergence (see the toy sketch after this list).

  • I had to implement a custom loss function, which I compute inside the same forward function that produces the output. I am not sure whether this causes any problem behind the scenes, because most examples I see online use a built-in loss function.

  • I read about the use of detach() and used it in the one place where I thought it was needed, but I am not sure whether it is needed anywhere else.
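
For reference, here is a minimal toy sketch of the pattern I am relying on (this is not the actual model; ToyRecurrence and its sizes are made up, and it is written against a recent PyTorch). It just shows that autograd unrolls a plain Python for loop the same way theano.scan would, that the loss can be assembled inside forward(), and that .detach() is what would cut the gradient at an intermediate value.

import torch
import torch.nn as nn

class ToyRecurrence(nn.Module):
    def __init__(self, in_dim=4, state_dim=3):
        super(ToyRecurrence, self).__init__()
        self.w_h = nn.Linear(in_dim, state_dim, bias=False)
        self.w_d = nn.Linear(state_dim, state_dim, bias=False)

    def forward(self, x):                       # x: (T, B, in_dim)
        d_t = torch.zeros(x.size(1), self.w_d.in_features)
        loss = 0
        for t in range(x.size(0)):              # plain Python loop instead of theano.scan
            d_t = torch.tanh(self.w_h(x[t]) + self.w_d(d_t))
            # toy hinge-style loss accumulated inside forward(); calling d_t.detach()
            # before the next iteration would truncate backprop through time
            loss = loss + (1 - d_t).clamp(min=0).sum()
        return loss

mdl = ToyRecurrence()
loss = mdl(torch.randn(5, 2, 4))
loss.backward()
print(mdl.w_h.weight.grad.abs().sum(), mdl.w_d.weight.grad.abs().sum())  # both non-zero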

In case someone has implemented this paper and has any feedback on where I am making mistakes, it would be great to hear it.

Thanks

import torch.nn as nn
import torch.nn.init as init
import torch, copy, random, time, pdb, numpy as np
from torch.autograd import Variable
import torch.nn.functional as F

from util import *
from torch import optim
from itertools import ifilter
import cPickle, csv  # cPickle is used below to load the GloVe embeddings, csv to write trajectories

class Config(object):
    def __init__(self, compared=[], **kwargs):
        self.name = "rmn"
        self.word_drop = 0.75
        self.desc_dim = 30
        self.book_dim = 50
        self.num_negs = 50
        self.char_dim = 50
        self.alpha_train_point = 15
        self.train_epochs = 20
        self.alpha_init_val = 0.5
        self.eval = False
        self.vocab_size = None
        self.emb_dim = None       
        self.num_books = None
        self.num_chars = None
        
    def __repr__(self):
        ks = sorted(k for k in self.__dict__ if k not in ['name'])
        return '\n'.join('{:<30s}{:<s}'.format(k, str(self.__dict__[k])) for k in ks)

# ---global data and config initializations---
BATCH_SIZE, prc_batch_cn = 50, 0
span_data, span_size, wmap, cmap, bmap = load_data('data/relationships.csv.gz', 'data/metadata.pkl')
config = Config()
"""
it is basically dividing each word representation by the sqrt(sum(x_i^2))
so we have 16414, 300 divided by 16414, 1 ...one normalizer for each word of the vocabulary
it is basically making it a unit vector for each word, so we are ignoring vector length/magnitude and only relying on 
direction to use it in downstream tasks like similarity calculation and so on
"""
We = cPickle.load(open('data/glove.We', 'rb')).astype('float32')
We = torch.from_numpy(We)
We = F.normalize(We)
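# (equivalently, a manual sketch of the same step: We / We.norm(2, 1, keepdim=True).clamp(min=1e-12),
#  since F.normalize defaults to the L2 norm along dim=1 with eps=1e-12)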

config.vocab_size, config.emb_dim, d_word = We.size(0), We.size(1), We.size(1)

config.num_chars = len(cmap)    
config.num_books = len(bmap)
config.vocab_size = len(wmap)

# this is basically one data point, where it is in turn composed of multiple time steps or spans
num_traj = len(span_data)
revmap = {}
for w in wmap:
    revmap[wmap[w]] = w
# ---initialization close

class RMNModel(nn.Module):        
    def __init__(self, config, emb_data):
        super(RMNModel, self).__init__()
        # the embedding layer to lookup the pre-trained glove embedding of span words
        self.w_embed = nn.Embedding(config.vocab_size, config.emb_dim)
        self.w_embed.weight.requires_grad = False
        self.w_embed.weight.data.copy_(emb_data)
        
        self.c_embed = nn.Embedding(config.num_chars, config.char_dim)
        self.b_embed = nn.Embedding(config.num_books, config.book_dim)

        self.softmax = nn.Softmax(dim=1)  # applied to 2-D input, softmax over the descriptor dimension
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(config.word_drop)
        
        self.w_d_h = nn.Linear(config.emb_dim, config.desc_dim, bias=False)
        self.w_d_prev = nn.Linear(config.desc_dim, config.desc_dim, bias=False)
        self.w_rel = nn.Linear(config.desc_dim, config.emb_dim, bias=False)
        
        # the below 3 layers form the transformation from individual span rep to h_in
        self.w_c_to_emb = nn.Linear(config.char_dim, config.emb_dim, bias=False)
        self.w_b_to_emb = nn.Linear(config.book_dim, config.emb_dim, bias=False)
        self.w_vs_to_emb = nn.Linear(config.emb_dim, config.emb_dim, bias=True)
        
        self.v_alpha = nn.Linear(config.emb_dim*2 + config.desc_dim, 1, bias=False)
        self.alpha = Variable(config.alpha_init_val * torch.ones(1,), requires_grad=False)
        if torch.cuda.is_available():
            self.alpha = self.alpha.cuda()
        self.train_alpha = False
        
    def set_train_alpha(self, val):
        self.train_alpha = val
        
    def update_alpha(self, input):
        self.alpha = self.sigmoid(self.v_alpha(Variable(input.data)))
        #this is batch_size * 1        
        
    # the dimension of the input sequences is T * B * S, where T is the max number of spans
    # available for any (c1, c2, book) triple in the batch, B is the batch size, and S is the
    # max span size (max sequence length)
    def forward(self, input):
        # seq is size N * M where N = batch size and M = max sequence length
        bk_id, char_ids, seq, seq_mask, neg_seq, neg_seq_mask, spans_count_l = input                
        drop_mask = self.dropout(seq_mask)
        if self.training:
            drop_mask = drop_mask * (1 - config.word_drop)
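        # nn.Dropout scales the entries it keeps by 1/(1 - p) at train time, so multiplying by
        # (1 - word_drop) turns drop_mask back into a plain 0/1 word-dropout mask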
        # v_s has dimension say 8 * 116 * 300
        # is of size N * M * 300
        v_s = self.w_embed(seq)

        temp_ones = Variable(torch.ones(drop_mask.size(0), 1)).cuda()

        # mean out the sequence dimension
        seq_mask = seq_mask.unsqueeze(2)
        v_s_mask = v_s * seq_mask
        seq_mask_sums = torch.sum(seq_mask, 1)
        seq_mask_sums = torch.max(seq_mask_sums, temp_ones)        
        v_s_mask = torch.sum(v_s_mask, 1) / seq_mask_sums
            
        drop_mask = drop_mask.unsqueeze(2)
        drop_mask_sums = torch.sum(drop_mask, 1)
        drop_mask_sums = torch.max(drop_mask_sums, temp_ones)
        
        v_s_dropmask = v_s * drop_mask
        v_s_dropmask = torch.sum(v_s_dropmask, 1) / drop_mask_sums
        v_s_dropmask = self.w_vs_to_emb(v_s_dropmask)
        # now v_s is of size (8, 300) one word embedding for each span
        
        
        if neg_seq is not None:
            v_n = self.w_embed(neg_seq)
            #the negative words are not dropped out
            neg_seq_mask = neg_seq_mask.unsqueeze(2)
            v_n = v_n * neg_seq_mask
            v_n = torch.sum(v_n, 1) / torch.sum(neg_seq_mask, 1)
            
        v_b, v_c = self.b_embed(bk_id), self.c_embed(char_ids)
        # returns vars of size 1*50 and 1*2*50
        c1_var = v_c[:,0,:]
        c2_var = v_c[:,1,:]
        v_b, v_c_1, v_c_2 = self.w_b_to_emb(v_b), self.w_c_to_emb(c1_var), self.w_c_to_emb(c2_var)
        # v_c_1 is of size N*300 and v_b of N*300
        v_c = v_c_1 + v_c_2
        
        if spans_count_l is not None:
            # the second dimension is basically storing the maximum number of time steps that we can have for any data point
            seq_in = Variable(torch.zeros(BATCH_SIZE, max(spans_count_l), 300))
            seq_in_dp = Variable(torch.zeros(BATCH_SIZE, max(spans_count_l), 300))
            neg_seq_in = Variable(torch.zeros(BATCH_SIZE, config.num_negs, 300))

            if torch.cuda.is_available():
                seq_in = seq_in.cuda()
                seq_in_dp = seq_in_dp.cuda()
                neg_seq_in = neg_seq_in.cuda()
                
            cum_spans_count = 0
            cntr = 0
            for i in spans_count_l:
                # for the original with only sequence mask
                cur_seqq = v_s_mask[cum_spans_count:(cum_spans_count + i), :]
                if i != max(spans_count_l):
                    pad_res = torch.cat((cur_seqq, Variable(torch.zeros(max(spans_count_l) - i, 300)).cuda()), 0)
                    seq_in[cntr, :, :] = pad_res
                else:
                    seq_in[cntr, :, :] = cur_seqq
                
                # for the original with dropout and sequence mask both
                cur_seqq_dp = v_s_dropmask[cum_spans_count:(cum_spans_count + i), :]
                if i != max(spans_count_l):
                    pad_res_dp = torch.cat((cur_seqq_dp, Variable(torch.zeros(max(spans_count_l) - i, 300)).cuda()), 0)         
                    seq_in_dp[cntr, :, :] = pad_res_dp
                else:
                    seq_in_dp[cntr, :, :] = cur_seqq_dp
                
                if neg_seq is not None:
                    neg_seq_in[cntr,:,:] = v_n[cntr*config.num_negs:(cntr + 1)*config.num_negs, :]
                cum_spans_count += i
                cntr += 1
            if neg_seq is not None:
                del v_n
        del v_s

        # initalize
        total_loss = 0
        prev_d_t = Variable(torch.zeros(BATCH_SIZE, config.desc_dim), requires_grad=False)
        zrs = Variable(torch.zeros(BATCH_SIZE, config.num_negs), requires_grad=False)
        if torch.cuda.is_available():
            zrs = zrs.cuda()        
            prev_d_t = prev_d_t.cuda()
            
        trajn = []
        # compute the d_t vectors in parallel
        for t in range(max(spans_count_l)):
            # the dropout one is used here to calculate the mixed span representation
            v_st_dp = seq_in_dp[:, t, :].detach()            
            # the default only seq mask and no dropout applied is used to calculate the loss
            v_st_mask = seq_in[:, t, :].detach()
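            # note: .detach() returns these slices cut off from the graph, so no gradient flows
            # back into seq_in / seq_in_dp (and hence w_vs_to_emb or the span averages) through this path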
            
            # 20 * 300
            h_in = v_st_dp + v_b + v_c
            h_t = self.relu(h_in)     
            d_t = self.alpha * self.softmax(self.w_d_h(h_t) + self.w_d_prev(prev_d_t))  + (1 - self.alpha) * prev_d_t
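            # i.e. d_t = alpha * softmax(W_h h_t + W_prev d_{t-1}) + (1 - alpha) * d_{t-1}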
            # dt is of size batch_Size * 30
            sv = np.sum(np.isnan(d_t.data.cpu().numpy()).astype(int))
            if sv > 0:
                #pdb.set_trace()
                print("got nan in d_t")

            # size is 1 * 300
            if self.train_alpha:                
                self.update_alpha(torch.cat((h_t, d_t, v_st_dp), 1))
                sv2 = np.sum(np.isnan(self.alpha.data.cpu().numpy()).astype(int))
                if sv2 > 0:
                    print("got nan in alpha")
                    #pdb.set_trace()
            # save the relationship state for each time step and return it as the trajectory for the given data point
            # each data point corresponds to a single character pair and book and all spans of it
            if config.eval:
                trajn.append(d_t.data.cpu()) # move it out of gpu memory
            if neg_seq is None:
                continue
            
            # this is the reconstruction vector made using the dictionary and the hidden state vector d_t
            r_t = self.w_rel(d_t) # is of size BATCH * 300
            # normalization here            
            r_t = F.normalize(r_t) 
            v_st_mask = F.normalize(v_st_mask) # default is euclidean along the dim=1
            neg_seq_in = F.normalize(neg_seq_in, 2, 2) # default eps is 1e-12
            
            # this is the negative loss in the max margin equation
            # BATCH_SIZE * NUM_NEG * 300 times BATCH_SIZE * 1 * 300
            #v_n_res = torch.bmm(neg_seq_in, r_t.unsqueeze(2)).squeeze(2)
            v_n_res = neg_seq_in * r_t.unsqueeze(1)
            v_n_res = torch.sum(v_n_res, 2)
            # BATCH_SIZE * NUM_NEG
            
            # each of these is a matrix of size BATCH_SIZE * 300
            # we are doing a similarity between the two vectors like a dot product
            recon_loss = r_t * v_st_mask
            recon_loss = torch.sum(recon_loss, 1, keepdim=True)
            # now the recon loss is of size BATCH_SIZE * 1
                
            cur_loss = torch.sum(torch.max(zrs, 1 - recon_loss + v_n_res), 1)
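            # max-margin (hinge) loss per time step: sum over the negative samples of
            # max(0, 1 - r_t . v_pos + r_t . v_neg)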
            # this is batch_size * 1
            # this mask is for removing data points which dont have a valid value for this time step
            mask = Variable(torch.from_numpy((t < np.array(spans_count_l)).astype('float')).float()).cuda()
            loss = torch.dot(cur_loss, mask)
            total_loss += loss
            
            prev_d_t = d_t

        w_rel_mat = self.w_rel.weight
        # w_rel is a weight matrix of size d * K so we want to normalize each of the K descriptors along the 0 axis

        w_rel_mat_unit = F.normalize(w_rel_mat, 2, 0)
        w_rel_mm = torch.mm(w_rel_mat_unit.t(), w_rel_mat_unit)
        id_mat = Variable(torch.eye(w_rel_mat_unit.size(1))).cuda()
        w_rel_mm = w_rel_mm.sub(id_mat)
        
        ortho_penalty = 1e-6 * torch.norm(w_rel_mm)
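        # this penalizes ||R_hat^T R_hat - I|| (torch.norm of a matrix defaults to the Frobenius
        # norm), nudging the descriptor columns towards orthogonality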
        if total_loss is not None:
            total_loss += ortho_penalty
        del seq_in, seq_in_dp, neg_seq_in, prev_d_t, d_t, seq_mask, zrs
        # if you want to return multiple things put them into a list else it throws an error
        return total_loss, trajn


def train_epoch(mdl, optimizer):
    # BATCH_SIZE is shrunk below for the final partial batch, so declare the global up front
    global BATCH_SIZE
    random.shuffle(span_data)
    losses, bk_l, ch_l, curr_l, cm_l, dp_l,  ns_l, nm_l, num_spans = [], [], [], [], [], [], [], [], []
    prc_batch_cn, batch_cnt = 0, 0
    #temp_data = span_data[:200]
    for book, chars, curr, cm in span_data:
        # for each relation with s spans we generate n negative spans
        ns, nm = generate_negative_samples(num_traj, span_size, config.num_negs, span_data)
        book = torch.from_numpy(book).long()
        chars = torch.from_numpy(chars).long().view(1, 2)
        curr = torch.from_numpy(curr).long()
        ns = torch.from_numpy(ns).long()

        cm = torch.from_numpy(cm)
        nm = torch.from_numpy(nm)
        # word dropout

        if torch.cuda.is_available():
            book = book.cuda() # one book
            chars = chars.cuda() # one pair of character
            curr = curr.cuda() # list of spans for the above relation
            cm = cm.cuda() # the sequence mask for each span 
            ns = ns.cuda()
            nm = nm.cuda()

        bk_l.append(book)
        ch_l.append(chars)
        curr_l.append(curr)
        num_spans.append(curr.size(0))
        cm_l.append(cm)
        ns_l.append(ns)
        nm_l.append(nm)

        batch_cnt += 1
        if batch_cnt % BATCH_SIZE == 0:
            batch_cnt = 0
            bk_in = Variable(torch.cat(bk_l))
            ch_in = Variable(torch.cat(ch_l))
            curr_in = Variable(torch.cat(curr_l))
            cm_in = Variable(torch.cat(cm_l))
            ns_in = Variable(torch.cat(ns_l))
            nm_in = Variable(torch.cat(nm_l))

            # call training function here to get cost and loss                            
            optimizer.zero_grad()           
            loss, _ = mdl([bk_in, ch_in, curr_in, cm_in, ns_in, nm_in, num_spans])
            prc_batch_cn += 1
            losses.append(loss.data[0])
            loss.backward()
            
            torch.nn.utils.clip_grad_norm(mdl.parameters(), 10)
            optimizer.step()                

            del bk_l[:], ch_l[:], curr_l[:], cm_l[:], ns_l[:], nm_l[:], num_spans[:]
            del bk_in, ch_in, curr_in, cm_in, ns_in, nm_in
                
    if len(num_spans) > 0:
        # process the leftover elements that did not fill a full BATCH_SIZE batch
        BATCH_SIZE = len(num_spans)
        mdl.alpha = mdl.alpha[0].repeat(BATCH_SIZE, 1)
        
        bk_in = Variable(torch.cat(bk_l))
        ch_in = Variable(torch.cat(ch_l))
        curr_in = Variable(torch.cat(curr_l))
        cm_in = Variable(torch.cat(cm_l))
        ns_in = Variable(torch.cat(ns_l))
        nm_in = Variable(torch.cat(nm_l))
            
        # call training function here to get cost and loss                            
        optimizer.zero_grad()
        loss, _ = mdl([bk_in, ch_in, curr_in, cm_in, ns_in, nm_in, num_spans])
        
        prc_batch_cn += 1
        losses.append(loss.data[0])
        loss.backward()
        torch.nn.utils.clip_grad_norm(mdl.parameters(), 10)
        optimizer.step()                

        del bk_l[:], ch_l[:], curr_l[:], cm_l[:], ns_l[:], nm_l[:], num_spans[:]
        
    return sum(losses) / len(span_data)
        
def train(n_epochs):
    print d_word, span_size, config.desc_dim, config.vocab_size, config.num_chars, config.num_books, num_traj
    print 'compiling...'
    
    # build neural network here
    mdl = RMNModel(config, We)
    
    # enter train mode
    mdl.train()
    
    # transfer to gpu
    if torch.cuda.is_available():
        mdl.cuda()
        
    # print parameters and initialize them here
    for name, p in mdl.named_parameters():
        print(name, p.size(), p.requires_grad, type(p))
        if name == 'c_embed.weight' or name == 'b_embed.weight':
            print('init', name)
            init.normal(p)
        elif name == 'w_embed.weight':
            continue
        elif 'bias' not in name:
            print('init', name)
            init.xavier_uniform(p)
        else:
            print('init', name)
            init.constant(p, 0)
                
    params = list(filter(lambda p: p.requires_grad, mdl.parameters()))
    print('total params', len(params))
    optimizer = optim.Adam(params)
    print 'done compiling, now training...'

    min_loss = None
    for epoch in range(n_epochs):
        if epoch >= config.alpha_train_point:
            mdl.set_train_alpha(True)
            mdl.w_rel.weight.requires_grad = False
        start_time = time.time()    
        eloss = train_epoch(mdl, optimizer)
        end_time = time.time()
        print 'done with epoch: ', epoch, ' cost =', eloss, 'time: ', end_time - start_time
        if min_loss is None or eloss < min_loss:
            min_loss = eloss
            torch.save(mdl.state_dict(), "model_16.pth")
            torch.save(optimizer.state_dict(), "optimizer_16.pth")
        global BATCH_SIZE
        BATCH_SIZE = 50
        mdl.alpha = mdl.alpha[0].repeat(BATCH_SIZE, 1)
        
    torch.save(mdl.state_dict(), "model_16_last.pth")

"""
Since the descriptors are represented in the same 300 dimension space as that of the vocabulary
we can find nearest neighbors of the descriptor vector and select a label from the 10 most similar vocab words
"""
def save_descriptors(descriptor_log, weight_mat, We, revmap):
    We = We.numpy()
    # original weight matrix is emb_dim * desc_dim
    print 'writing descriptors...'
    R = F.normalize(weight_mat, 2, 0).cpu().numpy() # now this is of emb_dim * desc_dim

    log = open(descriptor_log, 'w')
    for ind in range(R.shape[1]):
        desc = R[:,ind]        
        # We is vocab * 300
        sims = We.dot(desc)
        # argsort ascending, then reverse with [::-1] so the most similar words come first
        ordered_words = np.argsort(sims)[::-1]
        desc_list = [ revmap[w] for w in ordered_words[:10]]
        log.write(' '.join(desc_list) + '\n')
        print('descriptor %d:' % ind)
        print(desc_list)
    log.flush()
    log.close()

def save_trajectories(trajectory_log, span_data, bmap, cmap, mdl):
    potter_books = ['B019PIOJYU', 'B019PIOJY0', 'B019PIOJVI', 'B019PIOJV8', 'B019PIOJZE', 'B019PIOJZ4', 'B019PIOJWW']
    print 'writing trajectories...'
    tlog = open(trajectory_log, 'wb')
    traj_writer = csv.writer(tlog)
    traj_writer.writerow(['Book', 'Char 1', 'Char 2', 'Span ID'] + \
        ['Topic ' + str(i) for i in range(30)])
    bc = 0
    print(len(span_data))
    for book, chars, curr, cm in span_data:
        c1, c2 = [cmap[c] for c in chars]
        bname = bmap[book[0]]
        if bname != 'Dracula' and bname != 'BourneBetrayal' and bname != 'RisingTides' and bname != 'BourneDeception':
            continue
        if c1 != 'Arthur' and c2 != 'Arthur':
            continue
        book = torch.from_numpy(book).long()
        chars = torch.from_numpy(chars).long().unsqueeze(0)

        curr = torch.from_numpy(curr).long()
        cm = torch.from_numpy(cm)
            
        if torch.cuda.is_available():
            book = Variable(book).cuda()
            chars = Variable(chars).cuda()
            curr = Variable(curr).cuda()
            cm = Variable(cm).cuda()
            
        _, traj = mdl([book, chars, curr, cm, None, None, [cm.size(0)]])
        print("{} {} {} {}".format(bname, c1, c2, len(traj)))
        for ind in range(len(traj)):
            step = traj[ind].squeeze(0)
            traj_writer.writerow([bname, c1, c2, ind] + step.numpy().tolist())
        bc += 1
        if bc > 5:
            break
    tlog.flush()
    tlog.close()

def test():    
    global BATCH_SIZE
    BATCH_SIZE = 1
    print 'loading data...'
    descriptor_log = 'descriptors_model_16.log'
    trajectory_log = 'trajectories_16.log'        
    print d_word, span_size, config.desc_dim, config.vocab_size, config.num_chars, config.num_books, num_traj
    config.eval = True
    mdl = RMNModel(config, We)
    if torch.cuda.is_available():
        mdl.cuda()        
    saved_state = torch.load("model_16.pth")
    mdl.load_state_dict(saved_state)
    mdl.eval()
    #save_trajectories(trajectory_log, span_data, bmap, cmap, mdl)    
    save_descriptors(descriptor_log, mdl.w_rel.weight.data, We, revmap)

if __name__ == '__main__':
    train(config.train_epochs)
    #test()