Embeddings not getting updated

# imports needed by the snippet below
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

# Create a new model to update the embeddings according to the requirement
class Modeler(nn.Module):
    
    def __init__(self, embed, vocab_size, embed_dim, keyword):
        super(Modeler, self).__init__()
        
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.embeddings.weight.data.copy_(torch.from_numpy(embed))
        self.embeddings.weight.requires_grad = True
        self.keyword = keyword
        self.linear1 = nn.Linear(embed_dim, 128)
        self.linear2 = nn.Linear(128, 1)
        
    def forward(self, input):
        embed = self.embeddings(input)
        k = embed.dot(self.keyword)  # note: k is computed here but never used below
        out = F.relu(self.linear1(embed))
        out = self.linear2(out)
        return out
    
losses = []
loss_function = nn.MSELoss()
lookup_tensor = torch.LongTensor([word_to_ix['domain']])
embed = model.embeddings(Variable(lookup_tensor))
n_model = Modeler(model.embeddings.weight.data.numpy(), len(vocab), embedding_size, embed)
optimizer = optim.SGD(n_model.parameters(), lr=0.001)
batch_size = 1000
for e in range(700):
    # reset the batch window at the start of every epoch
    start = 0
    end = batch_size
    for batch in range(int(len(data) / batch_size)):
        total_loss = torch.Tensor([0])
        for word, t in data[start:end]:
            word_id = word_to_ix[word]
            word_var = Variable(torch.LongTensor([word_id]))
            # Step 2. Recall that torch *accumulates* gradients. Before passing in a
            # new instance, you need to zero out the gradients from the old
            # instance
            n_model.zero_grad()
            # Step 3. Run the forward pass
            res = n_model(word_var)
            # Step 4. Compute your loss function. (Again, Torch wants the target
            # wrapped in a Variable)
            t = Variable(torch.FloatTensor([t]))
            loss = loss_function(res, t)
            print(loss, t)
            # Step 5. Do the backward pass and update the weights
            loss.backward()
            optimizer.step()
            total_loss += loss.data
        # advance the batch window once per batch, not once per word
        start += batch_size
        end += batch_size
        losses.append(total_loss)
print(np.mean(losses))

I don’t see anything wrong in your implementation.

How did you verify that the embeddings are not getting updated?
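
For example, a check along these lines (just a rough sketch, reusing the n_model, data, word_to_ix, loss_function and optimizer objects from your snippet) should show a nonzero gradient on the embedding table and a nonzero weight change after a single optimizer step:

# snapshot the embedding weights with .clone() so we keep a real copy,
# not a reference to the live tensor
before = n_model.embeddings.weight.data.clone()

word, t = data[0]
word_var = Variable(torch.LongTensor([word_to_ix[word]]))
target = Variable(torch.FloatTensor([t]))

n_model.zero_grad()
loss = loss_function(n_model(word_var), target)
loss.backward()
print(n_model.embeddings.weight.grad.abs().sum())             # should be nonzero

optimizer.step()
print((n_model.embeddings.weight.data - before).abs().sum())  # should also be nonzero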

I compared the original embeddings and the updated ones. No changes whatsoever.


Can you give me an isolated script that I can run? I’m happy to investigate. I think it’s some subtle user error.


Here is a minimal reproducible example:

"""
Using nn.Embedding as LookupTable
"""

from torch import nn
from torch.autograd import Variable
import torch

class LinearMulti(nn.Module):
    """
    Fetch the weight and bias from a lookup table based on agent/model id
    Params:
        sz_in: input layer
        sz_out: output layer
        model_ids: agent/model id
    Returns:
        Tensor [len(model_ids), sz_out]
    """
    def __init__(self, nmodels, sz_in, sz_out):
        super(LinearMulti, self).__init__()
        self.nmodels = nmodels
        self.sz_in = sz_in
        self.sz_out = sz_out

        if nmodels == 1:
            self.linear = nn.Linear(sz_in, sz_out)
        else:
            # XXX: potential bug - updateGradInput is overridden,
            # possible use of `register_backward_hook`
            self.weight_lut = nn.Embedding(nmodels, sz_in * sz_out) # 1x3x200
            self.bias_lut = nn.Embedding(nmodels, sz_out) # 1x3x20

    def forward(self, input, model_ids):
        """
        Params:
            input: shape [len(model_ids), sz_in]
        """
        if self.nmodels == 1:
            return self.linear(input)
        else:
            weight = self.weight_lut(model_ids) # 1 x 3 x 200
            weight_view = weight.view(-1, self.sz_in, self.sz_out) # 3 x 10 x 20
            bias = self.bias_lut(model_ids) # 1 x 3 x 20
            bias_view = bias.view(-1, self.sz_out) # 3x20

            a, b = input.size()
            input = input.view(a, 1, b) # 3x1x10

            out = torch.matmul(input, weight_view) # 3x1x20

            a, b, c = out.size()
            out = out.view(a, c) #3x20
            out = out.add(bias_view) # 3x20
            return out


if __name__ == "__main__":
    x = Variable(torch.ones(3, 4))
    model = LinearMulti(3, 4, 1)
    y = model.forward(x, Variable(torch.LongTensor([[1,2,1]])))
    target = Variable(torch.FloatTensor([
        [3],
        [10],
        [3],
        ]))
    print(target)
    print(y)

    learning_rate = 1e-1
    optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)
    loss_fn = torch.nn.MSELoss(size_average=False)

    for i in range(100):
        y = model.forward(x, Variable(torch.LongTensor([[1,2,1]])))
        loss = loss_fn(y, target)
        loss.backward(retain_graph=True)
        print(loss)

    # # Note: in the original test, the weight of l1, l2 is copied to the
    # # weight of linear_multi. Then test the matmul results are the same

@Ricky_Han your training loop never calls optimizer.zero_grad() or optimizer.step(), so the gradients computed by loss.backward() are never applied to the weights (and they keep accumulating). Here’s a corrected training loop:

if __name__ == "__main__":
    x = Variable(torch.ones(3, 4))
    model = LinearMulti(3, 4, 1)
    y = model(x, Variable(torch.LongTensor([[1,2,1]])))
    target = Variable(torch.FloatTensor([[3], [10], [3]]))
    print(target)
    print(y)

    learning_rate = 1e-1
    optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)
    loss_fn = torch.nn.MSELoss(size_average=False)

    for i in range(100):
        optimizer.zero_grad()
        y = model(x, Variable(torch.LongTensor([[1,2,1]])))
        loss = loss_fn(y, target)
        loss.backward(retain_graph=True)
        optimizer.step()
        print(loss)

Thank you very much. I have been staring at this code for 2 days now. Can you check why this model isn’t training?

model.py:

import torch
from linear_multi import LinearMulti
from torch import nn
from torch.legacy.nn import Add, Sum, Identity
from torch.autograd import Variable

class Encoder(nn.Module):
    def __init__(self, in_dim, hidsz):
        super(Encoder, self).__init__()
        self.lut = nn.Embedding(in_dim, hidsz) # in_dim agents, returns (batchsz, x, hidsz)
        self.bias = nn.Parameter(torch.randn(hidsz))

    def forward(self, inp):
        x = self.lut(inp)
        x = torch.sum(x, 1) # XXX: the original version is sum(2) but lua is 1-indexed
        x = x.add(self.bias) # XXX:
        return x

class CommNet(nn.Module):
    def __init__(self, opts):
        super(CommNet, self).__init__()
        self.opts = opts
        self.nmodels = opts['nmodels']
        self.nagents = opts['nagents']
        self.hidsz = opts['hidsz']
        self.nactions = opts['nactions']
        self.use_lstm = opts['model'] == 'lstm'

        # Comm
        if self.opts['comm_encoder']:
            # before merging comm and hidden, use a linear layer for comm
            if self.use_lstm: # LSTM has 4x weights for gates
                self._comm2hid_linear = LinearMulti(self.nmodels, self.hidsz, self.hidsz * 4)
            else:
                self._comm2hid_linear = LinearMulti(self.nmodels, self.hidsz, self.hidsz)

        # RNN: (comm + hidden) -> hidden
        if self.use_lstm:
            self._rnn_enc = self.__build_encoder(self.hidsz * 4)
            self._rnn_linear = LinearMulti(self.nmodels, self.hidsz, self.hidsz * 4)
        else:
            self._rnn_enc = self.__build_encoder(self.hidsz)
            self._rnn_linear = LinearMulti(self.nmodels, self.hidsz, self.hidsz)

        # Action layer
        self._action_linear = LinearMulti(self.nmodels, self.hidsz, self.nactions)
        self._action_baseline_linear = LinearMulti(self.nmodels, self.hidsz, 1)

        # Comm_out
        self._comm_out_linear = LinearMulti(self.nmodels, self.hidsz, self.hidsz * self.nagents)
        self._comm_out_linear_alt = LinearMulti(self.nmodels, self.hidsz, self.hidsz)

        # action_comm
        nactions_comm = self.opts['nactions_comm']
        if nactions_comm > 1:
            self._action_comm_linear = LinearMulti(self.nmodels, self.hidsz, nactions_comm)

    def forward(self, inp, prev_hid, prev_cell, model_ids, comm_in):
        self.model_ids = model_ids
        comm2hid = self.__comm2hid(comm_in)
        # below are the return values, for next time step
        if self.use_lstm:
            hidstate, prev_cell = self.__hidstate(inp, prev_hid, prev_cell, comm2hid)
        else:
            hidstate = self.__hidstate(inp, prev_hid, prev_cell, comm2hid)

        action_prob, baseline = self.__action(hidstate)

        comm_out = self.__comm_out(hidstate)

        if self.opts['nactions_comm'] > 1:
            action_comm = self.__action_comm(hidstate)
            return (action_prob, baseline, hidstate, comm_out, action_comm)
        else:
            return (action_prob, baseline, hidstate, comm_out)

    def __comm2hid(self, comm_in):
        # Lua Sum(2) -> Python sum(1), shape: [batch x nagents, hidden]
        comm2hid = torch.sum(comm_in, 1) # XXX: sum(2) -> 0-index
        if self.opts['comm_encoder']:
            comm2hid = self._comm2hid_linear(comm2hid, self.model_ids)
        return comm2hid

    def __hidstate(self, inp, prev_hid, prev_cell, comm2hid):
        if self.opts['model'] == 'mlp' or self.opts['model'] == 'rnn':
            hidstate = self._rnn(inp, prev_hid, comm2hid)
        elif self.use_lstm:
            hidstate, cellstate = self._lstm(inp, prev_hid, prev_cell, comm2hid)
            return hidstate, cellstate
        else:
            raise Exception('model not supported')
        return hidstate

    def _lstm(self, inp, prev_hid, prev_cell, comm_in):
        pre_hid = []
        pre_hid.append(self._rnn_enc(inp))
        pre_hid.append(self._rnn_linear(prev_hid, self.model_ids))
        # if comm_in:
        pre_hid.append(comm_in)
        A = sum(pre_hid)
        B = A.view(-1, 4, self.hidsz)
        C = torch.split(B, self.hidsz, 0)

        gate_forget = nn.Sigmoid()(C[0][0])
        gate_write = nn.Sigmoid()(C[0][1])
        gate_read = nn.Sigmoid()(C[0][2])
        in2c = self.__nonlin()(C[0][3])
        print(gate_forget.size(), prev_cell.size())
        print(in2c.size(), gate_write.transpose(0, 1).size())
        cellstate = sum([
            torch.matmul(gate_forget, prev_cell),
            torch.matmul(in2c.transpose(0,1), gate_write)
        ])
        hidstate = torch.matmul(self.__nonlin()(cellstate), gate_read)
        return hidstate, cellstate

    def _rnn(self, inp, prev_hid, comm_in):
        pre_hid = []
        pre_hid.append(self._rnn_enc(inp))

        pre_hid.append(self._rnn_linear(prev_hid, self.model_ids))
        # if comm_in:
        pre_hid.append(comm_in)

        sum_pre_hid = sum(pre_hid)
        hidstate = self.__nonlin()(sum_pre_hid)
        return hidstate

    def __action(self, hidstate):
        print('action_linear')
        print(self._action_linear.weight_lut.weight)
        action = self._action_linear(hidstate, self.model_ids)
        action_prob = nn.Softmax()(action) # was LogSoftmax

        baseline =  self._action_baseline_linear(hidstate, self.model_ids)

        return action_prob, baseline

    def __comm_out(self, hidstate):
        if self.opts['fully_connected']:
            # use different params depending on agent ID
            comm_out = self._comm_out_linear(hidstate, self.model_ids)
        else:
            # this is kind of weird, need to consult paper
            # linear from hidsz to hidsz, then non linear, then repeat?
            comm_out = hidstate
            if self.opts['comm_decoder'] >= 1:
                comm_out = self._comm_out_linear_alt(comm_out, self.model_ids) # hidsz -> hidsz
                if self.opts['comm_decoder'] == 2:
                    comm_out = self.__nonlin()(comm_out)
            comm_out = comm_out.repeat(self.nagents, 2) # hidsz -> 2 x hidsz # original: comm_out = nn.Contiguous()(nn.Replicate(self.nagents, 2)(comm_out))
        return comm_out

    def __action_comm(self, hidstate):
        action_comm = self._action_comm_linear(hidstate, self.model_ids)
        action_comm = nn.LogSoftmax()(action_comm)
        return action_comm


    def __nonlin(self):
        nonlin = self.opts['nonlin']
        if nonlin == 'tanh':
            return nn.Tanh()
        elif nonlin == 'relu':
            return nn.ReLU()
        elif nonlin == 'none':
            return Identity()
        else:
            raise Exception("wrong nonlin")

    def __build_encoder(self, hidsz):
        # in_dim = ((self.opts['visibility']*2+1) ** 2) * self.opts['nwords']
        in_dim = 1
        if self.opts['encoder_lut']:                   # if there are more than 1 agent, use a LookupTable
            return Encoder(in_dim, hidsz)
        else:                                          # if only 1 agent
            return nn.Linear(in_dim, hidsz)

train.py:

# import logging as log
# # set logger
# log.basicConfig(level=log.INFO, filename="leaver_train.log")
# console = log.StreamHandler()
# console.setLevel(log.INFO)
# log.getLogger("").addHandler(console)
import numpy as np
from model import CommNet
from torch.autograd import Variable
from torch import nn
import torch

N_AGENTS = 3
BATCH_SIZE = 1
LEVER = 3 
HIDSZ = 3


def train(episode):
    opts = {
        'comm_encoder': True,
        'nonlin': 'tanh',
        'nactions_comm': 0,
        'nwords': 1,
        'encoder_lut_nil': None,
        'encoder_lut': True,
        'hidsz': HIDSZ,
        'nmodels': N_AGENTS * 2,
        'nagents': N_AGENTS,
        'nactions': LEVER,
        'model': 'mlp',
        'batch_size': BATCH_SIZE,
        'fully_connected': True,
        'comm_decoder': 0,
    }

    actor = CommNet(opts).cuda()
    print(actor)


    inp = Variable(torch.zeros(BATCH_SIZE * N_AGENTS, 1).type(torch.LongTensor)) # input is none
    prev_hid = Variable(torch.zeros(BATCH_SIZE * N_AGENTS, HIDSZ)
                             .type(torch.FloatTensor))
    prev_cell = Variable(torch.zeros(BATCH_SIZE * N_AGENTS, HIDSZ))

    comm_in = Variable(
        torch.zeros(BATCH_SIZE * N_AGENTS,
                   N_AGENTS,
                   HIDSZ)
             .type(torch.FloatTensor))


    learning_rate = 1e-7
    optimizer = torch.optim.Adagrad(actor.parameters(), lr=learning_rate)
    loss_fn = torch.nn.MSELoss(size_average=False)

    # one hot for mapping action
    emb = nn.Embedding(1, 5).cuda() 
    emb.weight.data = torch.eye(5).cuda()

    # clip = 1e-1
    # torch.nn.utils.clip_grad_norm(actor.parameters(), clip)
    # torch.nn.utils.clip_grad_norm(actor._action_baseline_linear.parameters(), clip)
    # # torch.nn.utils.clip_grad_norm(actor._action_comm_linear.parameters(), clip)
    # torch.nn.utils.clip_grad_norm(actor._action_linear.parameters(), clip)
    # torch.nn.utils.clip_grad_norm(actor._comm_out_linear.parameters(), clip)
    # torch.nn.utils.clip_grad_norm(actor._comm2hid_linear.parameters(), clip)
    # torch.nn.utils.clip_grad_norm(actor._comm_out_linear_alt.parameters(), clip)
    # torch.nn.utils.clip_grad_norm(actor._rnn_enc.parameters(), clip)
    # torch.nn.utils.clip_grad_norm(actor._rnn_linear.parameters(), clip)
    # torch.nn.utils.clip_grad_norm(actor._action_baseline_linear.parameters(), clip)
    for i in range(episode):
        print(i)
        optimizer.zero_grad()
        ids = np.array([np.random.choice(N_AGENTS, LEVER, replace=False)
                        for _ in range(BATCH_SIZE)])
        # ids shape: [BATCH_SIZE, LEVER]
        model_ids = Variable(torch.from_numpy(np.reshape(ids, (1, -1))))


        action_prob, _baseline, prev_hid, comm_in = actor.forward(inp.cuda(),
                                                                 prev_hid.cuda(),
                                                                 prev_cell.cuda(),
                                                                 model_ids.cuda(),
                                                                 comm_in.cuda())

        comm_in = comm_in.view(BATCH_SIZE, N_AGENTS, N_AGENTS, HIDSZ)
        comm_in = comm_in.transpose(1, 2)
        comm_in = comm_in.contiguous().view(BATCH_SIZE * N_AGENTS, N_AGENTS, HIDSZ)

        lever_output = torch.multinomial(action_prob, 1)
        lever_ids = lever_output.view(BATCH_SIZE, LEVER)
        print(lever_ids)
        one_hot = emb(lever_ids) # one-hot rows, shape [BATCH_SIZE, LEVER, 5]
        distinct_sum = (one_hot.sum(1) > 0).sum(1).type(torch.FloatTensor)
        reward = distinct_sum / LEVER

        loss = - reward

        # batch_actions = action_prob.sum(0)
        # target = torch.ones(5) * BATCH_SIZE
        # loss = loss_fn(batch_actions, Variable(target, requires_grad=False))
        print(reward.sum(0) / BATCH_SIZE)
        repeat_reward = reward.view(1, BATCH_SIZE).data.repeat(1, LEVER).view(BATCH_SIZE * LEVER, 1)
        lever_output.reinforce(repeat_reward.cuda())
        loss.backward(retain_graph=True)
        optimizer.step()
        


        # reward = env.step(action_prob)

        # actor.train(ids, base_line=baseline, base_reward=reward, itr=i, log=log)
        # critic.train(ids, base_reward=reward, itr=i, log=log)


if __name__ == "__main__":
    train(10000)

The weights printed from __action() (self._action_linear.weight_lut.weight) never change between iterations.
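
A rough way to narrow this down (a debugging sketch, not a fix) is to print, right after loss.backward(), how much gradient reaches each parameter; note that with lr = 1e-7 even a nonzero gradient changes the weights so little per step that a printout may not show it:

# inspect the gradients flowing into each parameter after loss.backward()
for name, param in actor.named_parameters():
    if param.grad is None:
        print(name, 'has no gradient')
    else:
        print(name, 'grad norm:', param.grad.data.norm())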

Hi there,

I am experiencing a similar problem: the embeddings in my model don’t seem to change at all. To find out what is going on, I used the PyTorch code from the word embeddings tutorial as an example. This is what I am running:

import torch
import torch.nn.functional as F
from torch import nn, optim, autograd
import numpy as np

CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
        for i in range(len(test_sentence) - 2)]

print(trigrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}


class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        **self.embeddings.weight.requires_grad = True**
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out)
        return log_probs


losses = []
criterion = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(100):
    total_loss = torch.Tensor([0])
    for context, target in trigrams:
        **print('----------------------------------------')**
        **print('Before update')**
        **before = model.embeddings.weight.data.numpy()**

        context_idxs = [word_to_ix[w] for w in context]
        context_var = autograd.Variable(torch.LongTensor(context_idxs))

        model.zero_grad()

        log_probs = model(context_var)

        loss = criterion(log_probs, autograd.Variable(torch.LongTensor([word_to_ix[target]])))

        loss.backward()
        optimizer.step()

        **print('After update')**
        **after = model.embeddings.weight.data.numpy()**

        **# calculate the diff between the weights before an update and after the update**
        **print(np.nonzero(before-after))  # show if something has changed**

        total_loss += loss.data
    losses.append(total_loss)
print(losses)

This is pretty much the exact example code from the tutorial. To check whether the embedding weights actually get updated after one (or at least several) optimizer steps, I added a few lines, highlighted above (the lines beginning and ending with **). I also explicitly set self.embeddings.weight.requires_grad = True, but that didn’t change anything (still no visible updates).

I guess it is simply some stupid mistake in my code, or I am going about this the wrong way. It would be great if someone could take a look and help me out.

Thanks

Like I said, it was due to a stupid programming mistake. When I assigned the weights to the before and after variables, I was storing references to the same underlying tensor, so both “snapshots” were always identical, and subtracting one from the other naturally gives zero. What I needed to do was make an actual copy of the embedding weights with the clone method, so the call should look like before = model.embeddings.weight.clone(). With that change I can compute the diff between the old weights (the cloned ones) and the new ones after the updates, and the result shows that the weight updates take place as they should.
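
To illustrate the difference with a tiny standalone sketch (not taken from the model above):

import torch
from torch import nn

emb = nn.Embedding(5, 3)

alias = emb.weight.data             # shares storage with the live weight
snapshot = emb.weight.data.clone()  # an actual copy

emb.weight.data.add_(1.0)           # stand-in for an optimizer update

print((emb.weight.data - alias).abs().sum())     # 0: the alias moved along with the weight
print((emb.weight.data - snapshot).abs().sum())  # nonzero: the clone kept the old values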

So double check 🙂


Hi Michael,
Why update the gradients in the inner loop? Why not update once per epoch?

You could do that, but if the goal is just to see whether the embeddings change at all, both ways are fine.
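
If you did want a single update per epoch, the usual pattern is to let the gradients accumulate over the inner loop and call the optimizer once at the end; here is a sketch based on the tutorial loop above:

for epoch in range(100):
    total_loss = torch.Tensor([0])
    model.zero_grad()                # clear the accumulated gradients once per epoch
    for context, target in trigrams:
        context_idxs = [word_to_ix[w] for w in context]
        context_var = autograd.Variable(torch.LongTensor(context_idxs))
        log_probs = model(context_var)
        loss = criterion(log_probs,
                         autograd.Variable(torch.LongTensor([word_to_ix[target]])))
        loss.backward()              # gradients accumulate across the inner loop
        total_loss += loss.data
    optimizer.step()                 # one parameter update per epoch
    losses.append(total_loss)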

Thank you. I experienced the same thing.