# Create a new model to update the embeddings according to the requirement
class Modeler(nn.Module):
def __init__(self, embed, vocab_size, embed_dim, keyword):
super(Modeler, self).__init__()
self.embeddings = nn.Embedding(vocab_size, embed_dim)
self.embeddings.weight.data.copy_(torch.from_numpy(embed))
self.embeddings.weight.requires_grad = True
self.keyword = keyword
self.linear1 = nn.Linear(embed_dim, 128)
self.linear2 = nn.Linear(128, 1)
def forward(self, input):
embed = self.embeddings(input)
k = embed.dot(self.keyword)
out = F.relu(self.linear1(embed))
out = self.linear2(out)
return out
losses = []
loss_function = nn.MSELoss()
lookup_tensor = torch.LongTensor([word_to_ix['domain']])
embed = model.embeddings(Variable(lookup_tensor))
n_model = Modeler(model.embeddings.weight.data.numpy(), len(vocab),embedding_size, embed)
optimizer = optim.SGD(n_model.parameters(), lr=0.001)
batch_size = 1000
start = 0
end = batch_size
for e in range(700):
for epoch in range(int(len(data)/batch_size)):
total_loss = torch.Tensor([0])
for word, t in data[start:end]:
word_id = word_to_ix[word]
word_var = Variable(torch.LongTensor([word_id]))
# Step 2. Recall that torch *accumulates* gradients. Before passing in a
# new instance, you need to zero out the gradients from the old
# instance
n_model.zero_grad() # Step 3. Run the forward pass, getting log probabilities over next
# words
res = n_model(word_var)
# Step 4. Compute your loss function. (Again, Torch wants the target
# word wrapped in a variable)
t = Variable(torch.FloatTensor([t]))
loss = loss_function(res, t)
print(loss, t)
# Step 5. Do the backward pass and update the gradient
loss.backward()
optimizer.step()
total_loss += loss.data
start += batch_size
end += batch_size
losses.append(total_loss)
print(np.mean(losses))
I dont see anything wrong in your implementation.
How did you verify that the embeddings are not getting updated?
Compared the original and the latter. No changes whatsover.
Can you give me an isolated script that I can run, I’m happy to investigate. I think it’s some subtle user error.
Here is a minimal reproducible example:
"""
Using nn.Embedding as LookupTable
"""
from torch import nn
from torch.autograd import Variable
import torch
class LinearMulti(nn.Module):
"""
Fetch the weight and bias from a lookup table based on agent/model id
Params:
sz_in: input layer
sz_out: output layer
model_ids: agent/model id
Returns:
Tensor [len(model_ids), sz_out]
"""
def __init__(self, nmodels, sz_in, sz_out):
super(LinearMulti, self).__init__()
self.nmodels = nmodels
self.sz_in = sz_in
self.sz_out = sz_out
if nmodels == 1:
self.linear = nn.Linear(sz_in, sz_out)
else:
# XXX: potential bug - updateGradInput is overidden,
# possible use of `register_backward_hook`
self.weight_lut = nn.Embedding(nmodels, sz_in * sz_out) # 1x3x200
self.bias_lut = nn.Embedding(nmodels, sz_out) # 1x3x20
def forward(self, input, model_ids):
"""
Params:
input: shape [len(model_ids), sz_in]
"""
if self.nmodels == 1:
return self.linear(input)
else:
weight = self.weight_lut(model_ids) # 1 x 3 x 200
weight_view = weight.view(-1, self.sz_in, self.sz_out) # 3 x 10 x 20
bias = self.bias_lut(model_ids) # 1 x 3 x 20
bias_view = bias.view(-1, self.sz_out) # 3x20
a, b = input.size()
input = input.view(a, 1, b) # 3x1x10
out = torch.matmul(input, weight_view) # 3x1x20
a, b, c = out.size()
out = out.view(a, c) #3x20
out = out.add(bias_view) # 3x20
return out
if __name__ == "__main__":
x = Variable(torch.ones(3, 4))
model = LinearMulti(3, 4, 1)
y = model.forward(x, Variable(torch.LongTensor([[1,2,1]])))
target = Variable(torch.FloatTensor([
[3],
[10],
[3],
]))
print target
print(y)
learning_rate = 1e-1
optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss(size_average=False)
for i in range(100):
y = model.forward(x, Variable(torch.LongTensor([[1,2,1]])))
loss = loss_fn(y, target)
loss.backward(retain_graph=True)
print loss
# # Note: in the original test, the weight of l1, l2 is copied to the
# # weight of linear_multi. Then test the matmul results are the same
@Ricky_Han you have wrong code. Here’s a corrected training loop:
if __name__ == "__main__":
x = Variable(torch.ones(3, 4))
model = LinearMulti(3, 4, 1)
y = model(x, Variable(torch.LongTensor([[1,2,1]])))
target = Variable(torch.FloatTensor([[3], [10], [3]]))
print(target)
print(y)
learning_rate = 1e-1
optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss(size_average=False)
for i in range(100):
optimizer.zero_grad()
y = model(x, Variable(torch.LongTensor([[1,2,1]])))
loss = loss_fn(y, target)
loss.backward(retain_graph=True)
optimizer.step()
print(loss)
Thank you very much. I have been staring at this code for 2 days now. Can you check why this model isn’t training:
import torch
from linear_multi import LinearMulti
from torch import nn
from torch.legacy.nn import Add, Sum, Identity
from torch.autograd import Variable
class Encoder(nn.Module):
def __init__(self, in_dim, hidsz):
super(Encoder, self).__init__()
self.lut = nn.Embedding(in_dim, hidsz) # in_dim agents, returns (batchsz, x, hidsz)
self.bias = nn.Parameter(torch.randn(hidsz))
def forward(self, inp):
x = self.lut(inp)
x = torch.sum(x, 1) # XXX: the original version is sum(2) but lua is 1-indexed
x = x.add(self.bias) # XXX:
return x
class CommNet(nn.Module):
def __init__(self, opts):
super(CommNet, self).__init__()
self.opts = opts
self.nmodels = opts['nmodels']
self.nagents = opts['nagents']
self.hidsz = opts['hidsz']
self.nactions = opts['nactions']
self.use_lstm = opts['model'] == 'lstm'
# Comm
if self.opts['comm_encoder']:
# before merging comm and hidden, use a linear layer for comm
if self.use_lstm: # LSTM has 4x weights for gates
self._comm2hid_linear = LinearMulti(self.nmodels, self.hidsz, self.hidsz * 4)
else:
self._comm2hid_linear = LinearMulti(self.nmodels, self.hidsz, self.hidsz)
# RNN: (comm + hidden) -> hidden
if self.use_lstm:
self._rnn_enc = self.__build_encoder(self.hidsz * 4)
self._rnn_linear = LinearMulti(self.nmodels, self.hidsz, self.hidsz * 4)
else:
self._rnn_enc = self.__build_encoder(self.hidsz)
self._rnn_linear = LinearMulti(self.nmodels, self.hidsz, self.hidsz)
# Action layer
self._action_linear = LinearMulti(self.nmodels, self.hidsz, self.nactions)
self._action_baseline_linear = LinearMulti(self.nmodels, self.hidsz, 1)
# Comm_out
self._comm_out_linear = LinearMulti(self.nmodels, self.hidsz, self.hidsz * self.nagents)
self._comm_out_linear_alt = LinearMulti(self.nmodels, self.hidsz, self.hidsz)
# action_comm
nactions_comm = self.opts['nactions_comm']
if nactions_comm > 1:
self._action_comm_linear = LinearMulti(self.nmodels, self.hidsz, nactions_comm)
def forward(self, inp, prev_hid, prev_cell, model_ids, comm_in):
self.model_ids = model_ids
comm2hid = self.__comm2hid(comm_in)
# below are the return values, for next time step
if self.use_lstm:
hidstate, prev_cell = self.__hidstate(inp, prev_hid, prev_cell, comm2hid)
else:
hidstate = self.__hidstate(inp, prev_hid, prev_cell, comm2hid)
action_prob, baseline = self.__action(hidstate)
comm_out = self.__comm_out(hidstate)
if self.opts['nactions_comm'] > 1:
action_comm = self.__action_comm(hidstate)
return (action_prob, baseline, hidstate, comm_out, action_comm)
else:
return (action_prob, baseline, hidstate, comm_out)
def __comm2hid(self, comm_in):
# Lua Sum(2) -> Python sum(1), shape: [batch x nagents, hidden]
comm2hid = torch.sum(comm_in, 1) # XXX: sum(2) -> 0-index
if self.opts['comm_encoder']:
comm2hid = self._comm2hid_linear(comm2hid, self.model_ids)
return comm2hid
def __hidstate(self, inp, prev_hid, prev_cell, comm2hid):
if self.opts['model'] == 'mlp' or self.opts['model'] == 'rnn':
hidstate = self._rnn(inp, prev_hid, comm2hid)
elif self.use_lstm:
hidstate, cellstate = self._lstm(inp, prev_hid, prev_cell, comm2hid)
return hidstate, cellstate
else:
raise Exception('model not supported')
return hidstate
def _lstm(self, inp, prev_hid, prev_cell, comm_in):
pre_hid = []
pre_hid.append(self._rnn_enc(inp))
pre_hid.append(self._rnn_linear(prev_hid, self.model_ids))
# if comm_in:
pre_hid.append(comm_in)
A = sum(pre_hid)
B = A.view(-1, 4, self.hidsz)
C = torch.split(B, self.hidsz, 0)
gate_forget = nn.Sigmoid()(C[0][0])
gate_write = nn.Sigmoid()(C[0][1])
gate_read = nn.Sigmoid()(C[0][2])
in2c = self.__nonlin()(C[0][3])
print gate_forget.size(), prev_cell.size()
print in2c.size(), gate_write.transpose(0,1).size()
cellstate = sum([
torch.matmul(gate_forget, prev_cell),
torch.matmul(in2c.transpose(0,1), gate_write)
])
hidstate = torch.matmul(self.__nonlin()(cellstate), gate_read)
return hidstate, cellstate
def _rnn(self, inp, prev_hid, comm_in):
pre_hid = []
pre_hid.append(self._rnn_enc(inp))
pre_hid.append(self._rnn_linear(prev_hid, self.model_ids))
# if comm_in:
pre_hid.append(comm_in)
sum_pre_hid = sum(pre_hid)
hidstate = self.__nonlin()(sum_pre_hid)
return hidstate
def __action(self, hidstate):
print 'action_linear'
print self._action_linear.weight_lut.weight
action = self._action_linear(hidstate, self.model_ids)
action_prob = nn.Softmax()(action) # was LogSoftmax
baseline = self._action_baseline_linear(hidstate, self.model_ids)
return action_prob, baseline
def __comm_out(self, hidstate):
if self.opts['fully_connected']:
# use different params depending on agent ID
comm_out = self._comm_out_linear(hidstate, self.model_ids)
else:
# this is kind of weird, need to consult paper
# linear from hidsz to hidsz, then non linear, then repeat?
comm_out = hidstate
if self.opts['comm_decoder'] >= 1:
comm_out = self._comm_out_linear_alt(comm_out, self.model_ids) # hidsz -> hidsz
if self.opts['comm_decoder'] == 2:
comm_out = self.__nonlin()(comm_out)
comm_out.repeat(self.nagents, 2) # hidsz -> 2 x hidsz # original: comm_out = nn.Contiguous()(nn.Replicate(self.nagents, 2)(comm_out))
return comm_out
def __action_comm(self, hidstate):
action_comm = self._action_comm_linear(hidstate, self.model_ids)
action_comm = nn.LogSoftmax()(action_comm)
return action_comm
def __nonlin(self):
nonlin = self.opts['nonlin']
if nonlin == 'tanh':
return nn.Tanh()
elif nonlin == 'relu':
return nn.ReLU()
elif nonlin == 'none':
return Identity()
else:
raise Exception("wrong nonlin")
def __build_encoder(self, hidsz):
# in_dim = ((self.opts['visibility']*2+1) ** 2) * self.opts['nwords']
in_dim = 1
if self.opts['encoder_lut']: # if there are more than 1 agent, use a LookupTable
return Encoder(in_dim, hidsz)
else: # if only 1 agent
return nn.Linear(in_dim, hidsz)
# import logging as log
# # set logger
# log.basicConfig(level=log.INFO, filename="leaver_train.log")
# console = log.StreamHandler()
# console.setLevel(log.INFO)
# log.getLogger("").addHandler(console)
import numpy as np
from model import CommNet
from torch.autograd import Variable
from torch import nn
import torch
N_AGENTS = 3
BATCH_SIZE = 1
LEVER = 3
HIDSZ = 3
def train(episode):
opts = {
'comm_encoder': True,
'nonlin': 'tanh',
'nactions_comm': 0,
'nwords': 1,
'encoder_lut_nil': None,
'encoder_lut': True,
'hidsz': HIDSZ,
'nmodels': N_AGENTS * 2,
'nagents': N_AGENTS,
'nactions': LEVER,
'model': 'mlp',
'batch_size': BATCH_SIZE,
'fully_connected': True,
'comm_decoder': 0,
}
actor = CommNet(opts).cuda()
print(actor)
inp = Variable(torch.zeros(BATCH_SIZE * N_AGENTS, 1).type(torch.LongTensor)) # input is none
prev_hid = Variable(torch.zeros(BATCH_SIZE * N_AGENTS, HIDSZ)
.type(torch.FloatTensor))
prev_cell = Variable(torch.zeros(BATCH_SIZE * N_AGENTS, HIDSZ))
comm_in = Variable(
torch.zeros(BATCH_SIZE * N_AGENTS,
N_AGENTS,
HIDSZ)
.type(torch.FloatTensor))
learning_rate = 1e-7
optimizer = torch.optim.Adagrad(actor.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss(size_average=False)
# one hot for mapping action
emb = nn.Embedding(1, 5).cuda()
emb.weight.data = torch.eye(5).cuda()
# clip = 1e-1
# torch.nn.utils.clip_grad_norm(actor.parameters(), clip)
# torch.nn.utils.clip_grad_norm(actor._action_baseline_linear.parameters(), clip)
# # torch.nn.utils.clip_grad_norm(actor._action_comm_linear.parameters(), clip)
# torch.nn.utils.clip_grad_norm(actor._action_linear.parameters(), clip)
# torch.nn.utils.clip_grad_norm(actor._comm_out_linear.parameters(), clip)
# torch.nn.utils.clip_grad_norm(actor._comm2hid_linear.parameters(), clip)
# torch.nn.utils.clip_grad_norm(actor._comm_out_linear_alt.parameters(), clip)
# torch.nn.utils.clip_grad_norm(actor._rnn_enc.parameters(), clip)
# torch.nn.utils.clip_grad_norm(actor._rnn_linear.parameters(), clip)
# torch.nn.utils.clip_grad_norm(actor._action_baseline_linear.parameters(), clip)
for i in range(episode):
print i
optimizer.zero_grad()
ids = np.array([np.random.choice(N_AGENTS, LEVER, replace=False)
for _ in range(BATCH_SIZE)])
# ids shape: [BATCH_SIZE, 5]
model_ids = Variable(torch.from_numpy(np.reshape(ids, (1, -1))))
action_prob, _baseline, prev_hid, comm_in = actor.forward(inp.cuda(),
prev_hid.cuda(),
prev_cell.cuda(),
model_ids.cuda(),
comm_in.cuda())
comm_in = comm_in.view(BATCH_SIZE, N_AGENTS, N_AGENTS, HIDSZ)
comm_in = comm_in.transpose(1, 2)
comm_in = comm_in.contiguous().view(BATCH_SIZE * N_AGENTS, N_AGENTS, HIDSZ)
lever_output = torch.multinomial(action_prob, 1)
lever_ids = lever_output.view(BATCH_SIZE, LEVER)
print lever_ids
one_hot = emb(lever_ids) # 1x5x5
distinct_sum = (one_hot.sum(1) > 0).sum(1).type(torch.FloatTensor)
reward = distinct_sum / LEVER
loss = - reward
# batch_actions = action_prob.sum(0)
# target = torch.ones(5) * BATCH_SIZE
# loss = loss_fn(batch_actions, Variable(target, requires_grad=False))
print(reward.sum(0) / BATCH_SIZE)
repeat_reward = reward.view(1, BATCH_SIZE).data.repeat(1, LEVER).view(BATCH_SIZE * LEVER, 1)
lever_output.reinforce(repeat_reward.cuda())
loss.backward(retain_graph=True)
optimizer.step()
# reward = env.step(action_prob)
# actor.train(ids, base_line=baseline, base_reward=reward, itr=i, log=log)
# critic.train(ids, base_reward=reward, itr=i, log=log)
if __name__ == "__main__":
train(10000)
The weights are not updated in the printout.
Hi there,
i am experiencing a similar problem in that the embeddings in my model don’t seem to change at all. As an example to find out what is going on I actually used the pytorch code from the word embeddings tutorial. This is what I am running:
import torch
import torch.nn.functional as F
from torch import nn, optim, autograd
import numpy as np
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
for i in range(len(test_sentence) - 2)]
print(trigrams[:3])
vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}
class NGramLanguageModeler(nn.Module):
def __init__(self, vocab_size, embedding_dim, context_size):
super(NGramLanguageModeler, self).__init__()
self.embeddings = nn.Embedding(vocab_size, embedding_dim)
**self.embeddings.weight.requires_grad = True**
self.linear1 = nn.Linear(context_size * embedding_dim, 128)
self.linear2 = nn.Linear(128, vocab_size)
def forward(self, inputs):
embeds = self.embeddings(inputs).view((1, -1))
out = F.relu(self.linear1(embeds))
out = self.linear2(out)
log_probs = F.log_softmax(out)
return log_probs
losses = []
criterion = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)
for epoch in range(100):
total_loss = torch.Tensor([0])
for context, target in trigrams:
**print('----------------------------------------')**
**print('Before update')**
**before = model.embeddings.weight.data.numpy()**
context_var = autograd.Variable(torch.LongTensor(context_idxs))
model.zero_grad()
log_probs = model(context_var)
loss = criterion(log_probs, autograd.Variable(torch.LongTensor([word_to_ix[target]])))
loss.backward()
optimizer.step()
**print('After update')**
**after = model.embeddings.weight.data.numpy()**
**# calculate the diff between the weights before an update and after the update**
**print(np.nonzero(before-after)) # show if something has changed**
total_loss += loss.data
losses.append(total_loss)
print(losses)
This is pretty much the exact example code from the tutorial. To show the change in the weights I added a few lines of code. I highlighted the lines (the lines beginning and ending with **) that I added in order to check if the embedding weights actually get updated after taking one (or at least multiple) optimizer step(s). I explicitly set the self.embeddings.weight.requires_grad = True
but that didn’t change anything (still no visible updates).
I guess it simply is some stupid mistake that I make in the code, or I am trying to do it the wrong way. It would be great if someone could take a look and help me out on this.
Thanks
Like I said it is due to a stupid programming mistake. Since I am assigning the weights to the variables before
and after
I am accessing the same object using a reference. So both weight matrices are exactly the same. So logically if I subtract them from another the result will always be zero. What I needed to do in order for it to work is create a copy of the embedding weights by using the clone
method. So the call should look like before = model.embeddings.weight.clone()
. Now I was able to calculate the diff between the old version of the weights (the ones I cloned) and the new ones after the updates. The result reflects that the weight updates take place as they should.
So double check
Hi Michael,
Why update gradient in the inner loop? Why not update gradient in each epoch?
You could do that. But to only see if the embeddings change at all I guess both ways are fine.
Thank you. I experienced the same thing.