I'm new to PyTorch and just wrote a model for binary classification using the Hugging Face RoBERTa model. In TensorFlow one iteration normally takes 200-300 ms, but here each iteration takes almost 1 s. Am I doing something wrong? Thanks!
This is my model structure:
import numpy as np
import transformers as tfm
import torch as T
import torch.nn as nn
from sklearn.model_selection import train_test_split
from tqdm import tqdm

LR = 5e-5
DROPOUT_RATE = 0.25
class TweetModel(tfm.RobertaModel):
    def __init__(self, conf):
        super().__init__(conf)
        # Pretrained RoBERTa backbone, embeddings resized to the custom tokenizer vocab
        self.backbone = tfm.RobertaModel.from_pretrained(PRETRAINED_PATH, config=conf)
        self.backbone.resize_token_embeddings(TOKENIZER.get_vocab_size())
        self.drop_out = nn.Dropout(DROPOUT_RATE)
        # Learnable GeM pooling exponents, initialized to 3
        self.gm_exp_avg = nn.Parameter(T.ones(1) * 3)
        self.gm_exp_max = nn.Parameter(T.ones(1) * 3)
        self.gem_avg = nn.AvgPool1d(128)  # kernel spans the full sequence length
        self.gem_max = nn.MaxPool1d(128)
        self.dense = nn.Linear(768 * 4, 1)  # 2 hidden layers x 2 pooling branches
        # nn.init.normal_(self.dense.weight, std=0.02)

    def forward(self, ids, mask, token_type_ids):
        out = self.backbone(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # Concatenate the last two hidden layers -> (batch, seq, 768 * 2)
        out_seqs = [self.drop_out(x) for x in out[2][-2:]]
        out_seqs = T.cat(out_seqs, dim=-1)
        # -> (batch, channels, seq); clamp keeps pow() numerically stable
        out_seqs = T.transpose(out_seqs, 1, 2).clamp(min=1e-7)
        # GeM pooling: (pool(x^p))^(1/p) over the sequence dimension
        out_avg = self.gem_avg(out_seqs.pow(self.gm_exp_avg)).pow(1. / self.gm_exp_avg)
        out_avg = T.squeeze(out_avg)
        out_max = self.gem_max(out_seqs.pow(self.gm_exp_max)).pow(1. / self.gm_exp_max)
        out_max = T.squeeze(out_max)
        out = T.cat([out_avg, out_max], -1)  # (batch, 768 * 4)
        out = self.dense(out)
        return out
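The two pooling branches are generalized-mean (GeM) pooling, (mean(x^p))^(1/p), with learnable exponents. The pattern in isolation, with shapes matching the model above:

x = T.rand(2, 768 * 2, 128).clamp(min=1e-7)    # (batch, channels, seq), as after the transpose
p = T.ones(1) * 3                              # exponent, initialized to 3 like gm_exp_avg
gem = nn.AvgPool1d(128)(x.pow(p)).pow(1. / p)  # (mean over seq of x^p)^(1/p)
print(gem.shape)                               # torch.Size([2, 1536, 1])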
Data loader:
def convert_binary(data_df):
    data_df = data_df.reset_index(drop=True)
    data_dict = {
        'inputs': [],
        'typeid': [],
        'mask': [],
        'sentiment': []
    }
    for i in range(len(data_df)):
        # <s> ... </s> added manually (ids 0 and 2 for RoBERTa)
        input_id = [0] + TOKENIZER.encode(data_df.loc[i, 'text_cleaned']).ids + [2]
        LEN_PAD = MAXLEN - len(input_id)
        if LEN_PAD < 0:
            # Truncate to MAXLEN, keeping the special tokens at both ends
            input_id = input_id[:(MAXLEN - 1)] + [2]
        type_id = [0] * len(input_id)
        mask_id = [1] * len(input_id)
        if LEN_PAD > 0:
            input_id = input_id + [1] * LEN_PAD  # pad id is 1 for RoBERTa, 0 for BERT
            type_id = type_id + [0] * LEN_PAD
            mask_id = mask_id + [0] * LEN_PAD
        data_dict['inputs'].append(input_id)
        data_dict['typeid'].append(type_id)
        data_dict['mask'].append(mask_id)
        data_dict['sentiment'].append(data_df.loc[i, 'sentiment'])
    return data_dict
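A quick sanity check on the converter, assuming TOKENIZER is a tokenizers ByteLevelBPETokenizer (the vocab/merges paths below are placeholders) and MAXLEN is 128 to match the pooling kernel:

import pandas as pd
from tokenizers import ByteLevelBPETokenizer

MAXLEN = 128
TOKENIZER = ByteLevelBPETokenizer('vocab.json', 'merges.txt')  # placeholder paths

toy = pd.DataFrame({'text_cleaned': ['great movie', 'terrible plot'], 'sentiment': [1, 0]})
batch = convert_binary(toy)
assert all(len(row) == MAXLEN for row in batch['inputs'])  # every row padded/truncated to MAXLEN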
class binary_gen:
    def __init__(self, data, batch_size=BATCH_SIZE, branch='train'):
        self.data = data
        self.batch_size = batch_size
        self.branch = branch
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        if self.branch == 'train':
            # Reshuffle the whole dataframe at the start of every epoch
            self.data = self.data.sample(frac=1.0).reset_index(drop=True)
        for i in range(self.steps):
            d = self.data[i * self.batch_size: (i + 1) * self.batch_size]
            d_dict = convert_binary(d)
            Inputs = T.tensor(d_dict['inputs'], dtype=T.long)
            Typeids = T.tensor(d_dict['typeid'], dtype=T.long)
            Masks = T.tensor(d_dict['mask'], dtype=T.long)
            if self.branch == 'train':
                Y = T.tensor(d_dict['sentiment'], dtype=T.long)
                yield Inputs, Typeids, Masks, Y
            else:
                yield Inputs, Typeids, Masks
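Iterating the generator yields ready-to-use tensors, e.g. with BATCH_SIZE = 32 and MAXLEN = 128 (train_df being any dataframe with text_cleaned and sentiment columns):

gen = binary_gen(train_df, batch_size=32)
inputs, type_ids, masks, y = next(iter(gen))
print(inputs.shape, y.shape)  # torch.Size([32, 128]) torch.Size([32])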
Here are my training steps:
device = T.device("cuda")
model_config = tfm.RobertaConfig.from_pretrained(PRETRAINED_PATH, output_hidden_states=True)
model = TweetModel(conf=model_config)
model = model.to(device)

tr, val = train_test_split(external2, test_size=0.1, stratify=external2.sentiment, random_state=0)
tr, val = tr.reset_index(drop=True), val.reset_index(drop=True)
tr_data, val_data = binary_gen(tr), binary_gen(val, branch='valid')

num_train_steps = int(len(tr) / BATCH_SIZE * NUM_EPOCHS)
optimizer = tfm.AdamW(model.parameters(), lr=LR)
scheduler = tfm.get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_train_steps),  # 10% linear warmup, then cosine decay
    num_training_steps=num_train_steps
)
def train_binary(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    tk0 = tqdm(data_loader, total=len(data_loader), position=0, ncols=70)
    losses = []
    loss_fn = nn.BCEWithLogitsLoss()
    for bi, d in enumerate(tk0):
        ids = d[0].to(device, T.long)
        token_type_ids = d[1].to(device, T.long)
        mask = d[2].to(device, T.long)
        targets = d[3].reshape(-1, 1).to(device, T.float32)
        optimizer.zero_grad()
        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
        )
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        probs = T.sigmoid(outputs).detach().cpu().numpy()  # kept around for metrics
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        tk0.set_postfix(loss='{:.4f}'.format(np.mean(losses)))
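For reference, I drive and time it with an outer loop roughly like this (sketch; torch.cuda.synchronize() before reading the clock, since CUDA kernels run asynchronously):

import time

for epoch in range(NUM_EPOCHS):
    start = time.time()
    train_binary(tr_data, model, optimizer, device, scheduler)
    T.cuda.synchronize()  # wait for queued GPU work before stopping the timer
    print(f'epoch {epoch}: {(time.time() - start) / len(tr_data):.3f}s per iteration')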