BERT training fails with "device-side assert triggered"

I am training BERT (KoBERT) with num_classes=5.

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import WarmupLinearSchedule

Setting parameters

max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 1
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

bertmodel, vocab = get_pytorch_kobert_model()

all_encoder_layers, pooled_output = bertmodel(input_ids, token_type_ids, input_mask)
pooled_output.shape

tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
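As a quick sanity check (not in the original post; the sample sentence below is my own), the SentencePiece tokenizer built above can be called directly on a string:

# Illustrative only: tokenize one Korean sentence with the tokenizer built above.
sample = "한국어 문장을 토크나이즈합니다."
print(tok(sample))   # prints the sub-word tokens for the sentence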

class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

data_train = BERTDataset(train_temp[0:2000], 1, 0, tok, max_len, True, False)
data_test = BERTDataset(train_temp[2000:], 1, 0, tok, max_len, True, False)
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)
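Not part of the original code, but inspecting one batch can catch shape and label problems before they surface as a CUDA assert later on; the check below assumes the dataloaders are built exactly as above:

# Illustrative check of one training batch.
token_ids, valid_length, segment_ids, label = next(iter(train_dataloader))
print(token_ids.shape, segment_ids.shape, label.shape)   # expect roughly [64, 64], [64, 64], [64]
print(label.min().item(), label.max().item())            # must stay within [0, num_classes - 1]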

class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=5,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device))
        # use the pooled output directly when no dropout rate is set, so `out` is always defined
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

Prepare optimizer and schedule (linear warmup and decay)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
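A device-side assert like the one below is frequently caused by a target index outside [0, num_classes - 1] reaching CrossEntropyLoss or the embedding lookup. A minimal check on the raw labels (illustrative, assuming each row of train_temp stores the label at index 0, as in the BERTDataset calls above):

# Illustrative label-range check; assumes train_temp rows carry the label at index 0.
labels = np.array([int(row[0]) for row in train_temp])
print(labels.min(), labels.max())   # for num_classes=5 this must be 0 and at most 4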

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_step, t_total=t_total)

def calc_accuracy(X, Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
    return train_acc

%%time
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
        #torch.cuda.reset_max_memory_cached()
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))


RuntimeError                              Traceback (most recent call last)
<ipython-input> in <module>

~/venv/lib/python3.7/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    164                 products. Defaults to False.
    165         """
--> 166         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    167
    168     def register_hook(self, hook):

~/venv/lib/python3.7/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     97     Variable._execution_engine.run_backward(
     98         tensors, grad_tensors, retain_graph, create_graph,
---> 99         allow_unreachable=True)  # allow_unreachable flag
    100
    101

RuntimeError: transform: failed to synchronize: cudaErrorAssert: device-side assert triggered

Is the model running fine on the CPU?
If so, could you rerun it via CUDA_LAUNCH_BLOCKING=1 python script.py args and post the stack trace here please?
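Since the code above looks like it runs in a notebook rather than as a script, the same effect can be had by setting the variable from Python, as long as it happens before the first CUDA call (a minimal sketch, not from the original reply):

# Must run before torch touches the GPU, i.e. before any .to(device) / .cuda() call.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"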

Have you solved the problem yet? I'm running into a similar problem.

Similar problem here: it always runs for around 7 epochs and then throws an error.

RuntimeError: reduce failed to synchronize: cudaErrorAssert: device-side assert triggered

The loss function is BCE and the optimizer is Adam.
Have you solved the problem?
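For what it's worth, with BCELoss this assert is commonly triggered by targets outside [0, 1] or by feeding raw logits instead of probabilities. A minimal check under those assumptions (logits and targets below are placeholders for your model output and labels, not names from the post):

import torch
# logits, targets: placeholders for the model output and the labels.
assert float(targets.min()) >= 0.0 and float(targets.max()) <= 1.0, "BCELoss targets must lie in [0, 1]"
probs = torch.sigmoid(logits)                      # BCELoss expects probabilities; BCEWithLogitsLoss takes raw logits
loss = torch.nn.BCELoss()(probs, targets.float())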