import os

import torch
import torch.nn.functional as F
from tqdm import tqdm

best_val_loss = 1e+6
model.train()
for epoch in range(num_epochs):
    loss_train = 0
    for data in tqdm(tokenized_dataset, desc=f"training epoch = {epoch}"):
        # data['input_ids'].shape = (variable, 512)
        probs = torch.tensor(0.0).to(Device)
        labels = torch.tensor(data['Anorexia']).to(Device)
        for i in range(len(data['input_ids'])):
            tokens = {
                'input_ids': torch.LongTensor(data['input_ids'][i]).to(Device),
                'attention_mask': torch.LongTensor(data['attention_mask'][i]).to(Device),
            }
            # tokens['input_ids'].shape = (512,), same for tokens['attention_mask']
            model_output = model(**tokens)
            # model_output = SequenceClassifierOutput(loss=None, logits=tensor([[-0.8730, 0.6156]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
            prob = F.softmax(model_output['logits'], dim=-1)
            # positive-class probability for this chunk
            prob = prob[:, 1].mean()
            # accumulate the positive-class probability over all chunks of this document
            probs += prob
        loss = criterion(probs, labels.float())
        loss_train += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"training_loss = {loss_train/len(train_dataset)}")
This is my training loop. I am training a sequence classification model on a 32 GB GPU.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
The texts are very long, so I split each text into subtexts and stacked the tokenized outputs together. I am unable to use a DataLoader because input_ids and attention_mask have different shapes from one document to the next.
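For context, the chunking step is not shown above; it looks roughly like the sketch below. The tokenize_long_text helper and the use of return_overflowing_tokens are my assumptions about how the stacking could be done with a fast tokenizer, not the exact code I run.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_long_text(text, max_length=512):
    # With a fast tokenizer, return_overflowing_tokens emits one row per
    # 512-token chunk, so input_ids has shape (num_chunks, 512) and
    # num_chunks varies from document to document.
    encoded = tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_overflowing_tokens=True,
    )
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
    }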
After 23 iterations I get a CUDA out-of-memory error, even though I only pass one tokenized chunk of shape (512,) to the model at a time. Can someone help me resolve this error? I don't think it is simply a lack of memory; it looks like memory is accumulating somewhere.
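One way I could confirm the suspicion that memory is accumulating (a diagnostic sketch, not part of my training code) is to log the allocated GPU memory after each document:

import torch

# Placed after optimizer.step() in the outer loop: if the allocated number
# keeps growing from one document to the next, something is being retained
# across iterations rather than freed.
allocated_mb = torch.cuda.memory_allocated(Device) / 1024**2
reserved_mb = torch.cuda.memory_reserved(Device) / 1024**2
print(f"allocated = {allocated_mb:.1f} MiB, reserved = {reserved_mb:.1f} MiB")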