Hi all!
I’m trying to train a part-of-speech tagger using the data from the Brown corpus from NLTK using the universal tagset:
Here is the dataset implementation:
class BrownDataset(Dataset):
def __init__(self):
self.sents = []
self.tags = []
for tagged_sent in brown.tagged_sents(tagset="universal"):
words, tags = list(zip(*tagged_sent))
self.sents.append(words)
self.tags.append(tags)
c = Counter([word.lower() for sent in self.sents for word in sent])
self.w2i = {"<PAD>": 0, "<UNK>": 1}
for i, (w, _) in enumerate(c.most_common(VOCAB_SIZE - 2), 2):
self.w2i[w] = i
self.i2w = {i: w for w, i in self.w2i.items()}
self.t2i = {"<PAD>": 0}
for i, t in enumerate({tag for tags in self.tags for tag in tags}, 1):
self.t2i[t] = i
self.i2t = {i: t for t, i in self.t2i.items()}
def __getitem__(self, index):
return torch.tensor([self.w2i.get(w.lower(), self.w2i["<UNK>"]) for w in self.sents[index]]), torch.tensor([self.t2i[t] for t in self.tags[index]])
def __len__(self):
return len(self.sents)
My model is fairly simple, similar to the model in the PyTorch tutorial:
class PosTagger(nn.Module):
def __init__(self):
super(PosTagger, self).__init__()
self.embedding = nn.Embedding(VOCAB_SIZE, EMB_DIM)
self.lstm = nn.LSTM(EMB_DIM, HIDDEN_SIZE, batch_first=True)
self.fc = nn.Linear(HIDDEN_SIZE, TAGSET_SIZE + 1) # +1 for PAD
def forward(self, x):
x = self.embedding(x)
x, _ = self.lstm(x)
return F.log_softmax(self.fc(x), dim=1)
This is the setup code:
def collate_fn(batch):
sents, tags = list(zip(*batch))
return pad_sequence(sents, batch_first=True), pad_sequence(tags, batch_first=True)
dataset = BrownDataset()
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
model = PosTagger()
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters())
And this is the training loop:
for epoch in tqdm(range(EPOCHS)):
for sents, targets in dataloader:
model.zero_grad()
predictions = model(sents)
batch_size, seq_len, _ = predictions.shape
loss = loss_function(predictions.view(batch_size * seq_len, -1), targets.view(-1))
loss.backward()
optimizer.step()
The loss per batch is constantly fluctuating between 2 and 4. Am I doing something wrong?
Thanks!