Hi there!
I am attempting to train a biLSTM model, following a tutorial, to use as a binary classifier for textual data. Currently I am using GloVe embeddings of dimension 300 with a custom embedding layer constructed from the vocabulary of my corpus and the pretrained GloVe vectors. The embedding layer fits into memory without issue, and its weights are frozen so the embeddings are not trained further. I've capped each sequence at a maximum length of 256 tokens.
I’ll post my code below for those interested
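First, the embedding matrix is built along these lines (a minimal sketch; build_embedding_matrix and glove_vectors are illustrative stand-ins for my actual loading code):

import numpy as np

def build_embedding_matrix(vocab, glove_vectors, emb_dim=300):
    # One row per vocabulary entry; words missing from GloVe get small random vectors
    embs = np.random.normal(scale=0.1, size=(len(vocab), emb_dim)).astype(np.float32)
    for word, idx in vocab.items():
        if word in glove_vectors:
            embs[idx] = glove_vectors[word]
    return embs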
My dataset class and dataloader
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, idx):
        row = self.df.iloc[idx].values
        tokens, length = row[1]  # column 1 holds (token_ids, pre-padding length) pairs
        label = row[3]
        return tokens, label, length

    def get_class_weights(self):
        # Inverse-frequency weights: total count divided by per-class count
        total_texts = self.__len__()
        num_non_shooter_texts, num_shooter_texts = self.df["label"].value_counts()
        print(f"Value counts:\n{self.df['label'].value_counts()}")
        non_shooter_wt = total_texts / num_non_shooter_texts
        shooter_wt = total_texts / num_shooter_texts
        print(f"non_shooter: {non_shooter_wt}\nshooter: {shooter_wt}")
        return [non_shooter_wt, shooter_wt]
train_loader = DataLoader(train_set, batch_size=32, pin_memory=True)
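Each post is tokenized and padded/truncated to the max length of 256 during preprocessing, so every sample in a batch has the same shape and the default collate can stack them; the padding step is roughly this (pad_id stands in for my padding index):

MAX_LEN = 256

def pad_tokens(token_ids, pad_id=0):
    # Truncate to MAX_LEN, right-pad with pad_id, and keep the pre-padding length
    token_ids = token_ids[:MAX_LEN]
    length = len(token_ids)
    return token_ids + [pad_id] * (MAX_LEN - length), length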
The network is a two-layer biLSTM:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class LSTMTextClassifier(nn.Module):
    def __init__(self, embs, emb_dim: int = 300, dropout: float = 0.5, hidden_size: int = 128, num_layers: int = 2):
        super(LSTMTextClassifier, self).__init__()
        # from_pretrained already freezes the weights by default; being explicit here
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embs).float())
        self.embedding.weight.requires_grad = False
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(emb_dim, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * hidden_size, 1)
        self.dropout = nn.Dropout(p=dropout)
        self.sig = nn.Sigmoid()

    def forward(self, x, lengths):
        """Perform a forward pass through the network.

        Args:
            x (torch.Tensor): A tensor of token ids with shape (batch_size, max_sent_length)
            lengths: the lengths of the sequences before padding
        """
        embs = self.embedding(x)
        packed_input = pack_padded_sequence(embs, lengths, batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed_input)
        out, _ = pad_packed_sequence(packed_out, batch_first=True)
        # Last valid timestep of the forward direction, first timestep of the backward direction
        out_forward = out[range(len(out)), lengths - 1, :self.hidden_size]
        out_backwards = out[:, 0, self.hidden_size:]
        out_reduced = torch.cat((out_forward, out_backwards), 1)  # Concat for the fc layer and final prediction
        out_dropped = self.dropout(out_reduced)  # Dropout layer
        out = self.fc(out_dropped)
        out = self.sig(out)
        return out
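As a quick sanity check on shapes (dummy data; the random matrix just stands in for the real GloVe embeddings):

dummy_embs = np.random.rand(5000, 300).astype(np.float32)  # stand-in for the GloVe matrix
model = LSTMTextClassifier(dummy_embs)
x = torch.randint(0, 5000, (32, 256))   # batch of 32 padded sequences
lengths = torch.randint(1, 257, (32,))  # pre-padding lengths (kept on CPU for packing)
print(model(x, lengths).shape)          # torch.Size([32, 1])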
What confuses me is that my dataset is not particularly large: it consists of around 10,000 posts averaging around 100 words in length, some a bit more. Seeing as the biLSTM model is not very large either, I struggle to see how I would run out of memory on a 32 GB GPU after just a few batches / epochs (depending on the batch size). I am only running 10 epochs.
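To make that concrete, here is a rough back-of-envelope estimate of the per-batch activation sizes (assuming float32 and the hyperparameters above; an estimate, not a measurement):

# Rough per-batch activation sizes, float32 = 4 bytes
batch, seq_len, emb_dim, hidden = 32, 256, 300, 128
emb_bytes  = batch * seq_len * emb_dim * 4      # ~9.4 MiB of embeddings
lstm_bytes = batch * seq_len * 2 * hidden * 4   # 8.0 MiB of final-layer biLSTM output
print(emb_bytes / 2**20, lstm_bytes / 2**20)

Even allowing for the per-gate and per-layer intermediates that autograd keeps around for the backward pass, that seems orders of magnitude below 32 GB. An epoch looks like this: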
def run_epoch():
    running_loss = 0.
    for data in train_loader:
        inputs, labels, lengths = data
        labels = torch.tensor(np.array(labels), dtype=torch.float32).to(device)
        inputs = torch.from_numpy(np.array(inputs)).to(device)
        optimizer.zero_grad()
        # Weighting scheme to accommodate the small minority class
        # Weights are about 1.2 for the majority class and 6 for the minority
        weighting = []
        for l in labels:
            if l == 0:
                weighting.append(class_wts[0])
            else:
                weighting.append(class_wts[1])
        outputs = model(inputs, lengths)
        # The per-sample weight tensor has to live on the same device as the outputs
        loss_fn = nn.BCELoss(weight=torch.tensor(weighting, device=device))
        loss = loss_fn(outputs.squeeze(dim=1), labels)  # Squeeze the output so it matches the targets' shape
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)
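The surrounding setup and outer loop are nothing exotic; roughly this (the Adam settings here are representative, not exact):

model = LSTMTextClassifier(embs).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
class_wts = train_set.get_class_weights()

for epoch in range(10):
    avg_loss = run_epoch()
    print(f"epoch {epoch}: avg loss {avg_loss:.4f}")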
Any help is appreciated