import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
class Atten_BiLSTM(nn.Module):
    """
    Code source:
    https://github.com/littleflow3r/attention-bilstm-for-relation-classification/blob/master/model.py
    """
    def __init__(self, hyperparameters):
        super(Atten_BiLSTM, self).__init__()
        self.hidden_dim = hyperparameters.hidden_dim
        self.batch_size = hyperparameters.batch_size
        self.emb_dim = hyperparameters.embedding_dim
        self.gpu = 'cuda' if torch.cuda.is_available() else 'cpu'
        # vocab_size 456, embedding_dim 300
        self.embedding = nn.Embedding(hyperparameters.vocab_size, hyperparameters.embedding_dim)
        # embedding_dim 300, hidden_dim 256, bidirectional True
        self.encoder = nn.LSTM(hyperparameters.embedding_dim,
                               hyperparameters.hidden_dim,
                               bidirectional=hyperparameters.bidirectional)
        # hidden_dim 256, num_classes 4
        self.fc = nn.Linear(hyperparameters.hidden_dim, hyperparameters.num_classes)
        # dropout 0.2
        self.dropout = nn.Dropout(hyperparameters.dropout)
    def attnetwork(self, encoder_out, final_hidden):
        # encoder_out: [batch, seq_len, hidden_dim], final_hidden: [1, batch, hidden_dim]
        hidden = final_hidden.squeeze(0)
        # score each time step against the final hidden state
        attn_weights = torch.bmm(encoder_out, hidden.unsqueeze(2)).squeeze(2)
        soft_attn_weights = F.softmax(attn_weights, 1)
        # attention-weighted sum over time steps -> one context vector per sequence
        new_hidden = torch.bmm(encoder_out.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return new_hidden
    def forward(self, sequence):
        emb_input = self.embedding(sequence)
        inputx = self.dropout(emb_input)
        output, (hn, cn) = self.encoder(inputx)
        # sum the forward and backward halves of the encoder output
        fbout = output[:, :, :self.hidden_dim] + output[:, :, self.hidden_dim:]
        fbout = fbout.permute(1, 0, 2).to('cuda')
        # sum the last forward and backward hidden states
        fbhn = (hn[-2, :, :] + hn[-1, :, :]).unsqueeze(0)
        attn_out = self.attnetwork(fbout, fbhn)
        logits = self.fc(attn_out)
        return logits
    def train_(self, optimizer, loss_fn, train_dataset, epochs=2):
        train_loss = 0.
        for _ in tqdm(range(epochs)):
            self.train()
            epoch_loss = 0.0
            for _, samples in enumerate(train_dataset):
                inputs, labels = samples['inputs'], samples['outputs']
                optimizer.zero_grad()
                # inputs shape [128, 256] = [batch size, max seq len]
                predictions = self(inputs)
                # print(predictions.shape)  # torch.Size([256, 4])
                # print(labels.shape)       # torch.Size([128, 256])
                loss = loss_fn(predictions, labels)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
        return train_loss
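Before the training driver below, note what attnetwork returns: the attention weights pool the encoder outputs over time, so whatever arrives as the first dimension of fbout becomes the batch dimension of the logits, and the sequence dimension disappears entirely. A quick standalone check with made-up toy sizes (an illustration only, not the real hyperparameters) shows this:

import torch
import torch.nn.functional as F

batch, seq_len, hidden = 3, 5, 7                   # toy sizes for illustration
encoder_out = torch.randn(batch, seq_len, hidden)  # what attnetwork receives as encoder_out
final_hidden = torch.randn(1, batch, hidden)       # what it receives as final_hidden

hidden_vec = final_hidden.squeeze(0)                                  # [batch, hidden]
scores = torch.bmm(encoder_out, hidden_vec.unsqueeze(2)).squeeze(2)   # [batch, seq_len]
weights = F.softmax(scores, dim=1)
context = torch.bmm(encoder_out.transpose(1, 2), weights.unsqueeze(2)).squeeze(2)
print(context.shape)  # torch.Size([3, 7]) -- one vector per sequence, seq_len is pooled away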
atten_model = Atten_BiLSTM(hyperparams).to('cuda')
train_dataset_ = DataLoader(train_dataset, batch_size=hyperparams.batch_size, shuffle=True)
lossf = nn.CrossEntropyLoss(ignore_index=train_dataset.label2idx['<PAD>'])
opt = optim.Adam(atten_model.parameters(), lr=1e-6)
atten_model.train_(optimizer=opt, loss_fn=lossf, train_dataset=train_dataset_, epochs=2)
ERROR
ValueError: Expected input batch_size (256) to match target batch_size (128), raised on the line loss = loss_fn(predictions, labels).
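The sizes in the message line up with the default tensor layout of nn.LSTM. With batch_first left at its default of False, the encoder reads its input as [seq_len, batch, emb_dim], so the embedded batch of shape [128, 256, 300] is treated as 128 time steps with a batch of 256; after the attention pooling and the linear layer this yields logits of shape [256, 4], while the labels are still [128, 256]. A minimal sketch of this interpretation (an assumption based on the shapes in the comments above, using dummy tensors rather than the real data):

import torch
import torch.nn as nn

batch, seq_len, emb_dim, hid = 128, 256, 300, 256
lstm = nn.LSTM(emb_dim, hid, bidirectional=True)  # batch_first defaults to False
x = torch.randn(batch, seq_len, emb_dim)          # laid out as [batch, seq, emb] by the embedding
out, (hn, cn) = lstm(x)
print(out.shape)  # torch.Size([128, 256, 512]) -- read as [seq_len=128, batch=256, 2*hid]
print(hn.shape)   # torch.Size([2, 256, 256])   -- [num_directions, batch=256, hid]

If that reading is right, passing batch_first=True to nn.LSTM (and dropping the permute in forward, since fbout would then already be [batch, seq, hidden]) would give logits of shape [128, 4]. CrossEntropyLoss would then still need one label per sequence ([128]) rather than one per token ([128, 256]), because the attention step pools away the sequence dimension; if the task is per-token labelling, the pooling itself is what has to change.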