Training a classification model on 8 K80 GPUs always stops after epoch 1, with no exception logged

cosinepi · September 22, 2020, 5:03pm

cosinepi:

import transformers
from transformers import BertModel, BertTokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import time
import logging
# Route DEBUG-and-above records from the root logger to a log file.
# (The pasted source used typographic quotes here, which Python cannot parse.)
logging.basicConfig(filename='bert_classifier.log', level=logging.DEBUG)
logging.debug('This message should go to the log file')
logging.info('So should this')
logging.warning('And this, too')

import logging
# NOTE: basicConfig() does nothing once the root logger already has handlers,
# so this second call does not change the level configured above.
logging.basicConfig(level=logging.ERROR)

# Plot styling (typographic quotes from the paste replaced with ASCII quotes).
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
#rcParams['figure.figsize'] = 12, 8

# Seed NumPy and PyTorch so the split and training run are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the CSV path is empty in the posted code (presumably redacted);
# supply the real path. Later code reads the `Title` and `Flag` columns.
df = pd.read_csv("")
#df.Text = df.Title + "." + df.Bullet + "." + df.Description
def to_sentiment(rating):
    """Map a numeric rating to a three-way sentiment class.

    Args:
        rating: Anything accepted by ``int()`` (e.g. ``4``, ``"4"``, ``4.7``).

    Returns:
        ``0`` for ratings of 2 or below, ``1`` for exactly 3, ``2`` otherwise.

    Raises:
        ValueError / TypeError: if ``rating`` cannot be converted by ``int()``.
    """
    rating = int(rating)
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    else:
        return 2

# Label ids used by the classifier head (its output size is len(class_names)).
class_names = [0, 1, 2]

# Checkpoint name shared by the tokenizer and the BERT encoder.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Every review is padded/truncated to this many tokens.
MAX_LEN = 150

class GPReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    The pasted source had lost the double underscores on the special methods
    (``init``/``len``/``getitem``); without them ``Dataset`` never calls them.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        """Store the raw data and tokenizer settings.

        Args:
            reviews: Indexable collection of review texts.
            targets: Indexable collection of integer labels, aligned with
                ``reviews``.
            tokenizer: Object exposing ``encode_plus`` (a BERT tokenizer here).
            max_len: Length every encoded review is padded/truncated to.
        """
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Tokenize review ``item`` and return it with its label.

        Returns:
            dict with keys ``review_text`` (str), ``input_ids`` and
            ``attention_mask`` (tokenizer output flattened to 1-D) and
            ``targets`` (scalar ``torch.long`` tensor).
        """
        review = str(self.reviews[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`; both pad up to `max_length`.
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }
# Hold out 10% of the rows for validation; the split is seeded with RANDOM_SEED.
df_train, df_val = train_test_split(
    df,
    test_size=0.1,
    random_state=RANDOM_SEED,
)

def create_data_loader(df, tokenizer, max_len, batch_size, *,
                       num_workers=0, drop_last=True, shuffle=False):
    """Wrap a dataframe in a ``GPReviewDataset`` and return a ``DataLoader``.

    Args:
        df: DataFrame providing a ``Title`` column (review text) and a
            ``Flag`` column (label).
        tokenizer: Tokenizer handed to the dataset.
        max_len: Padded/truncated sequence length handed to the dataset.
        batch_size: Number of items per batch.
        num_workers: Loader worker processes (default 0, as before).
        drop_last: Drop the final incomplete batch (default True, as before).
            Note that dropped rows are never seen, so a caller dividing by
            ``len(df)`` will slightly under-report accuracy.
        shuffle: Reshuffle every epoch (default False, as before).

    Returns:
        torch.utils.data.DataLoader over the dataset.
    """
    ds = GPReviewDataset(
        reviews=df.Title.to_numpy(),  # df.Text.to_numpy(),
        targets=df.Flag.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=drop_last,
        shuffle=shuffle,
    )

# Batch size passed to both data loaders.
BATCH_SIZE = 128

# Build one loader per split with identical tokenizer settings.
train_data_loader = create_data_loader(
    df_train,
    tokenizer,
    MAX_LEN,
    BATCH_SIZE,
)
val_data_loader = create_data_loader(
    df_val,
    tokenizer,
    MAX_LEN,
    BATCH_SIZE,
)

# NOTE(review): this stand-alone encoder is not referenced by the visible
# code (SentimentClassifier loads its own copy) -- confirm it is needed.
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head."""

    def __init__(self, n_classes):
        """Build the encoder and the head.

        Args:
            n_classes: Size of the output layer.
        """
        # The paste had lost the dunder underscores (`init`), so neither this
        # constructor nor nn.Module's was ever run.
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)
        # Kept for callers that want probabilities: `model.out(logits)`.
        self.out = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores (logits), one row per input.

        The file trains this model with ``nn.CrossEntropyLoss``, which applies
        log-softmax itself; returning softmax probabilities (as the original
        did) normalizes twice and flattens the gradients. The arg-max
        prediction is unchanged. Apply ``self.out`` to get probabilities.
        """
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Index access instead of tuple-unpacking: element 1 is the pooled
        # output whether the encoder returns a tuple or an output object.
        pooled_output = encoder_outputs[1]
        output = self.drop(pooled_output)
        return self.linear(output)

# Number of full passes over the training loader.
EPOCHS = 5

# Build the classifier, wrap it in DataParallel, then move it to `device`.
model = torch.nn.DataParallel(SentimentClassifier(len(class_names))).to(device)

optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    correct_bias=False,
)

# One scheduler step is taken per batch, so the schedule spans all batches
# of all epochs.
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

loss_fn = nn.CrossEntropyLoss().to(device)
def show_gpu(msg):
    """Log the memory usage reported by ``nvidia-smi``, prefixed by ``msg``.

    ref: Access GPU memory usage in Pytorch - #4 by mjstevens777

    Only the first line of the ``nvidia-smi`` output is parsed, so on a
    multi-GPU machine this reports the first listed GPU only.

    This is a diagnostic helper: if ``nvidia-smi`` is missing, fails, or
    prints something non-numeric, a warning is logged and the function
    returns instead of aborting the training run.

    Args:
        msg: Text placed in front of the usage figures.

    Returns:
        None.
    """
    # Local import: `subprocess` is not in the file's import block.
    import subprocess

    def query(field):
        # The paste had turned `--` into an en-dash, which nvidia-smi rejects.
        return subprocess.check_output(
            ['nvidia-smi', f'--query-gpu={field}',
             '--format=csv,nounits,noheader'],
            encoding='utf-8')

    def to_int(result):
        return int(result.strip().split('\n')[0])

    try:
        used = to_int(query('memory.used'))
        total = to_int(query('memory.total'))
    except (OSError, subprocess.CalledProcessError, ValueError) as err:
        logging.warning('%s unavailable (nvidia-smi query failed: %s)', msg, err)
        return

    pct = used / total if total else 0.0
    # The original passed the figures as a logging argument with no
    # placeholder in the message, and used an undefined `logger`.
    logging.info('\n%s %2.1f%% (%s out of %s)', msg, 100 * pct, used, total)
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: LR scheduler stepped once per batch.
        n_examples: Denominator for the accuracy (supplied by the caller).

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a double tensor equal to
        correct predictions / ``n_examples``; mean_loss is the mean of the
        per-batch losses, or ``nan`` if no batch completed.

    A batch that raises is logged with its traceback and skipped; the
    remaining batches are still processed.
    """
    model = model.train()

    losses = []
    correct_predictions = 0
    index = 0
    for d in data_loader:
        try:
            index = index + 1
            # Clear gradients first so a batch that failed midway cannot
            # leak stale gradients into this step.
            optimizer.zero_grad()

            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            if index % 50 == 0:
                logging.debug("finished steps: {}".format(index))
        except Exception as e:
            # Logged at ERROR with traceback; DEBUG-only logging made
            # failures easy to miss.
            logging.exception("error in training step: {}".format(e))

    # `correct_predictions` is still the int 0 if no batch succeeded;
    # as_tensor() makes `.double()` valid in that case too.
    accuracy = torch.as_tensor(correct_predictions).double() / n_examples
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar.
        device: Device the batch tensors are moved to.
        n_examples: Denominator for the accuracy (supplied by the caller).

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a double tensor equal to
        correct predictions / ``n_examples``; mean_loss is the mean of the
        per-batch losses, or ``nan`` if no batch completed.

    If a batch raises, the error is logged with its traceback, evaluation
    stops, and the figures for the batches seen so far are returned.
    """
    model = model.eval()

    losses = []
    correct_predictions = 0
    try:
        with torch.no_grad():
            for d in data_loader:
                input_ids = d["input_ids"].to(device)
                attention_mask = d["attention_mask"].to(device)
                targets = d["targets"].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )
                _, preds = torch.max(outputs, dim=1)

                #print(outputs)
                loss = loss_fn(outputs, targets)

                correct_predictions += torch.sum(preds == targets)
                losses.append(loss.item())
    except Exception as e:
        # The original referenced an undefined `logger` here, so the handler
        # itself raised NameError and hid the real error.
        logging.exception("eval error {}".format(e))

    # `correct_predictions` is still the int 0 if no batch succeeded;
    # as_tensor() makes `.double()` valid in that case too.
    accuracy = torch.as_tensor(correct_predictions).double() / n_examples
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss

# Per-epoch metrics, keyed by 'train_acc' / 'train_loss' / 'val_acc' / 'val_loss'.
history = defaultdict(list)
best_accuracy = 0
start_time = time.time()
show_gpu('Initial GPU memory usage:')
for epoch in range(EPOCHS):

    logging.info(f'Epoch {epoch + 1}/{EPOCHS}')
    logging.info('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    show_gpu('GPU memory usage after training model in epoch {}'.format(epoch))

    logging.info(f'Train loss {train_loss} accuracy {train_acc}')
    logging.info("time taken from start: {}".format(time.time() - start_time))

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )

    print(f'Val loss {val_loss} accuracy {val_acc}')
    logging.info("val accuracy {}, validation_loss {}".format(val_acc, val_loss))

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Checkpoint whenever validation accuracy improves.
    # NOTE: `model` is wrapped in DataParallel, so the saved keys carry a
    # `module.` prefix; load into a DataParallel model or strip the prefix.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state_{}.bin'.format(epoch))
        best_accuracy = val_acc

Hey guys, I tried to fine-tune a BERT classification model on 1.06 million examples, but the training always gets stuck after it finishes the first epoch, with no exception logged. Can anybody see what the problem is, or suggest any debugging ideas? Thanks a lot!

ptrblck · September 24, 2020, 7:41am

Could you remove the dataloading and check if your training routine would get stuck after training on random data?
If that’s not the case, could you iterate over the DataLoader alone, without the model training, and see if this might cause the issue?