class myLM(nn.Module):
    def __init__(self, params):
        super(myLM, self).__init__()
        self.vocab_size = params['vocab_size']
        self.d_emb = params['d_emb']
        self.d_hid = params['d_hid']
        self.batch_size = params['batch_size']
        self.embedding = nn.Embedding(self.vocab_size, self.d_emb)
        self.rnn = nn.RNN(self.d_emb, self.d_hid, num_layers=1, batch_first=True)
        # attention layers: ws/wq project the stored and current hidden states,
        # vs scores them, wc mixes a hidden state with its context vector
        self.ws = nn.Linear(self.d_hid, self.d_hid)
        self.wq = nn.Linear(self.d_hid, self.d_hid)
        self.vs = torch.randn(self.d_hid).to(device)
        self.wc = nn.Linear(2 * self.d_hid, self.d_hid)
        self.out = nn.Linear(self.d_hid, self.vocab_size)
    def forward(self, batch):
        bsz, seq_len = batch.size()  # 32x40
        rnn_hiddens = torch.zeros(seq_len, bsz, self.d_hid)
        for idx in range(seq_len):
            # batch[:, idx] -> idx-th word of all 32 samples
            # self.embedding(batch[:, idx]) -> embeddings of the idx-th words -> 32x50
            rnn_input = self.embedding(batch[:, idx]).view(bsz, 1, self.d_emb)
            _, hidden = self.rnn(rnn_input)
            rnn_hiddens[idx] = hidden.to(device)  # 1x32x50
        new_hiddens = torch.zeros(seq_len, bsz, 2 * self.d_hid)
        for t in range(seq_len):
            context = torch.zeros_like(rnn_hiddens[0])
            if t > 0:
                context = self.context(rnn_hiddens[t], rnn_hiddens[0:t])
            ht_new = torch.tanh(torch.cat((rnn_hiddens[t], context), 1))
            new_hiddens[t] = ht_new
        new_hiddens = new_hiddens.to(device)
        return self.out(self.wc(new_hiddens))
        # t = nn.functional.softmax(op, 2)
        # print(t.shape)
    def context(self, ht, buffer):
        ctx = torch.zeros_like(buffer[0])  # 32x50
        for t, hi in enumerate(buffer):
            scores = self.score(buffer[t], ht).to(device)  # 32
            scores = nn.functional.softmax(scores)
            # hi -> 32x50
            for i in range(ctx.size()[0]):
                ctx[i] += scores[i] * buffer[t][i]
        return ctx

    def score(self, hi, ht):
        hi = hi.to(device)
        ht = ht.to(device)
        return torch.tanh(self.ws(hi) + self.wq(ht)).mv(self.vs)  # [32]
params = {}
params['vocab_size'] = len(idx_to_word)
params['d_emb'] = 50
params['d_hid'] = 50
params['batch_size'] = 32
params['epochs'] = 8
params['learning_rate'] = 0.001
lm = myLM(params)
lm.cuda()
train_lm(--data--, params, lm)
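For reference, here is a minimal way to sanity-check the model's raw output shape on a dummy batch before training (a sketch only: the random batch and its length of 40 are made up for illustration, and device is assumed to be the same device used above):

dummy = torch.randint(0, params['vocab_size'], (params['batch_size'], 40)).to(device)
with torch.no_grad():
    out = lm(dummy)
print(out.shape)  # raw model output shape, before any slicing in train_lm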
The training code is as follows:
def train_lm(dataset, params, net):
    # since the first index corresponds to the PAD token, we just ignore it
    # when computing the loss
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
    num_examples, seq_len = dataset.size()
    batches = [(start, start + params['batch_size']) for start in
               range(0, num_examples, params['batch_size'])]
    for epoch in range(params['epochs']):
        ep_loss = 0.
        start_time = time.time()
        random.shuffle(batches)
        # for each batch, calculate loss and optimize model parameters
        for b_idx, (start, end) in enumerate(batches):
            print(b_idx)
            batch = dataset[start:end]
            print(batch.size())  # ---> 32x40
            preds = net(batch)
            preds = preds[:, :-1, :].contiguous().view(-1, net.vocab_size)
            print('pred', preds.shape)  # 1240 x vocab
            targets = batch[:, 1:].contiguous().view(-1)
            print('tgt', targets.shape)  # 1248
            loss = criterion(preds, targets)  # error here
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            ep_loss += loss
        print('epoch: %d, loss: %0.2f, time: %0.2f sec, dev perplexity: %0.2f' %
              (epoch, ep_loss, time.time() - start_time, compute_perplexity(wikitext['dev'], net)))
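compute_perplexity is defined elsewhere and not shown; roughly, it is assumed to be a helper along these lines (a sketch only, batching and slicing the same way as train_lm and exponentiating the average dev cross-entropy with the same ignore_index=0 convention):

import math

def compute_perplexity(dataset, net, bsz=32):
    # sketch of the dev-perplexity helper; the real implementation is not shown above
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    total_loss, n_batches = 0., 0
    with torch.no_grad():
        for start in range(0, dataset.size(0), bsz):
            batch = dataset[start:start + bsz]
            preds = net(batch)[:, :-1, :].contiguous().view(-1, net.vocab_size)
            targets = batch[:, 1:].contiguous().view(-1)
            total_loss += criterion(preds, targets).item()
            n_batches += 1
    return math.exp(total_loss / max(n_batches, 1))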
The loss computation fails at the line marked "error here": CrossEntropyLoss reports a size mismatch, since preds flattens to 1240 rows while targets flattens to 1248, as the prints above show. I am not sure where I am going wrong.