HuggingFace transformers model GPU training very slow

I am new to PyTorch and just wrote a model for binary classification on top of the HuggingFace RoBERTa model. In TensorFlow one iteration normally takes 200-300 ms, but here each iteration takes almost 1 s. Am I doing something wrong here? Thanks!
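
The per-iteration time is what the tqdm bar reports; to double-check it I also time a single step roughly like this (just a sketch, timed_step is a hypothetical helper; torch.cuda.synchronize() is there so the GPU work is actually included in the measurement):

import time
import torch

def timed_step(step_fn):
    # Hypothetical helper: step_fn is a callable that runs one
    # forward + backward + optimizer.step() on the GPU.
    torch.cuda.synchronize()
    start = time.perf_counter()
    step_fn()
    torch.cuda.synchronize()
    print(f'{time.perf_counter() - start:.3f} s / iteration')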

This is my model structure:

import transformers as tfm
import torch as T
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

LR = 5e-5
DROPOUT_RATE = 0.25

class TweetModel(tfm.RobertaModel):
    def __init__(self, conf):
        super().__init__(conf)
        self.backbone = tfm.RobertaModel.from_pretrained(PRETRAINED_PATH, config=conf)
        self.backbone.resize_token_embeddings(TOKENIZER.get_vocab_size())
        self.drop_out = nn.Dropout(DROPOUT_RATE)
        self.gm_exp_avg = nn.Parameter(T.ones(1) * 3)  # learnable GeM exponent (avg branch)
        self.gm_exp_max = nn.Parameter(T.ones(1) * 3)  # learnable GeM exponent (max branch)
        self.gem_avg = nn.AvgPool1d(128)                # pool over the full sequence length (MAXLEN)
        self.gem_max = nn.MaxPool1d(128)
        self.dense = nn.Linear(768 * 4, 1)              # 2 concatenated hidden states x (avg + max) = 3072
        # nn.init.normal_(self.dense.weight, std=0.02)
    
    def forward(self, ids, mask, token_type_ids):
        out = self.backbone(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )

        # Concatenate the last two hidden states along the feature dim: (batch, seq_len, 768*2)
        out_seqs = [self.drop_out(x) for x in out[2][-2:]]
        out_seqs = T.cat(out_seqs, dim=-1)
        out_seqs = T.transpose(out_seqs, 1, 2).clamp(min=1e-7)  # (batch, 768*2, seq_len) for the 1d pooling

        # Generalized-mean (GeM) pooling over the sequence dimension, avg- and max-based
        out_avg = self.gem_avg(out_seqs.pow(self.gm_exp_avg)).pow(1./self.gm_exp_avg)
        out_avg = T.squeeze(out_avg)

        out_max = self.gem_max(out_seqs.pow(self.gm_exp_max)).pow(1./self.gm_exp_max)
        out_max = T.squeeze(out_max)

        out = T.cat([out_avg, out_max], -1)
        out = self.dense(out)
        return out
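
For reference, these are the shapes I expect through the pooling head (a minimal sketch where a random tensor stands in for the concatenated last-two hidden states; MAXLEN is 128 in my setup and the batch size of 4 is just illustrative):

import torch as T
import torch.nn as nn

dummy = T.rand(4, 128, 768 * 2)                          # (batch, seq_len, two hidden states concatenated)
dummy = T.transpose(dummy, 1, 2).clamp(min=1e-7)         # (4, 1536, 128) for the 1d pooling layers

p = T.ones(1) * 3                                        # stand-in for the learnable GeM exponent
gem_avg = nn.AvgPool1d(128)(dummy.pow(p)).pow(1. / p)    # (4, 1536, 1)
gem_max = nn.MaxPool1d(128)(dummy.pow(p)).pow(1. / p)    # (4, 1536, 1)

pooled = T.cat([gem_avg.squeeze(-1), gem_max.squeeze(-1)], dim=-1)  # (4, 3072)
print(nn.Linear(768 * 4, 1)(pooled).shape)               # torch.Size([4, 1])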

Data loader:

def convert_binary(data_df):
    data_df = data_df.reset_index(drop=True)
    data_dict = {
        'inputs': [], 
        'typeid': [],
        'mask': [],
        'sentiment': []
    }  
    for i in range(len(data_df)):
        input_id = [0] + TOKENIZER.encode(data_df.loc[i, 'text_cleaned']).ids + [2]

        LEN_PAD = MAXLEN - len(input_id)
        if LEN_PAD < 0:
            input_id = input_id[:(MAXLEN - 1)] + [2]  # truncate, keeping the single leading <s> and closing </s>
            
        type_id = [0] * len(input_id) #TOKENIZER.encode(sentiment, tweet).type_ids
        mask_id = [1] * len(input_id) #TOKENIZER.encode(sentiment, tweet).attention_mask

        if LEN_PAD > 0:
            input_id = input_id + [1] * LEN_PAD # [1] for roberta [0] for bert
            type_id = type_id + [0] * LEN_PAD
            mask_id = mask_id + [0] * LEN_PAD

        data_dict['inputs'].append(input_id)
        data_dict['typeid'].append(type_id)
        data_dict['mask'].append(mask_id)
        data_dict['sentiment'].append(data_df.loc[i, 'sentiment'])
    return data_dict
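
# Quick illustration of the padding scheme above with made-up token ids and
# MAXLEN = 8 (RoBERTa special ids: <s> = 0, </s> = 2, <pad> = 1):
#   ids      = [0, 101, 102, 103, 2]
#   input_id = [0, 101, 102, 103, 2, 1, 1, 1]
#   mask_id  = [1, 1,   1,   1,   1, 0, 0, 0]
#   type_id  = [0, 0,   0,   0,   0, 0, 0, 0]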

class binary_gen:
    def __init__(self, data, batch_size=BATCH_SIZE, branch='train'):
        self.data = data
        self.batch_size = batch_size
        self.branch = branch
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        # while True:
        if self.branch == 'train':
            self.data = self.data.sample(frac=1.0).reset_index(drop=True)
        for i in range(self.steps):
            d = self.data[i * self.batch_size: (i + 1) * self.batch_size]
            d_dict = convert_binary(d)
            Inputs = T.tensor(d_dict['inputs'], dtype=T.long)
            Typeids = T.tensor(d_dict['typeid'], dtype=T.long)
            Masks = T.tensor(d_dict['mask'], dtype=T.long)
            if self.branch == 'train':
                Y = T.tensor(d_dict['sentiment'], dtype=T.long)
                yield Inputs, Typeids, Masks, Y
            else:
                yield Inputs, Typeids, Masks
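
For context, this is roughly how I sanity-check one batch from the generator (tr is the training DataFrame from the split below, and BATCH_SIZE / MAXLEN come from my config, so this is just a sketch):

gen = binary_gen(tr, branch='train')
inputs, typeids, masks, y = next(iter(gen))
print(inputs.shape, typeids.shape, masks.shape, y.shape)
# expecting (BATCH_SIZE, MAXLEN) for the first three tensors and (BATCH_SIZE,) for y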

Here are my training steps:

device = T.device("cuda")
model_config = tfm.RobertaConfig.from_pretrained(PRETRAINED_PATH, output_hidden_states=True)
model = TweetModel(conf=model_config)
model = model.to(device)

tr, val = train_test_split(external2, test_size=0.1, stratify=external2.sentiment, random_state=0)
tr, val = tr.reset_index(drop=True), val.reset_index(drop=True)
tr_data, val_data= binary_gen(tr), binary_gen(val, branch='valid')

num_train_steps = int(len(tr) / BATCH_SIZE * NUM_EPOCHS)
optimizer = tfm.AdamW(model.parameters(), lr=LR)
scheduler = tfm.get_cosine_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=int(0.1*num_train_steps), 
    num_training_steps=num_train_steps
)

def train_binary(data_loader, model, optimizer, device, scheduler=None):

    model.train()
    tk0 = tqdm(data_loader, total=len(data_loader), position=0, ncols=70)
    losses = []

    for bi, d in enumerate(tk0):

        ids = d[0].to(device, T.long)
        token_type_ids = d[1].to(device, T.long)
        mask = d[2].to(device, T.long)
        targets = d[3].reshape(-1, 1).to(device, T.float32)

        optimizer.zero_grad()
        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
        )

        loss = nn.BCEWithLogitsLoss()(outputs, targets)
        losses.append(loss.item())

        outputs = T.sigmoid(outputs).cpu().detach().numpy()
        loss.backward()
        optimizer.step()
        scheduler.step()
        tk0.set_postfix(loss='{:.4f}'.format(np.mean(losses)))
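
And the epoch loop is essentially just this (validation/metric code omitted; NUM_EPOCHS and the other constants are defined at the top of my notebook):

for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}')
    train_binary(tr_data, model, optimizer, device, scheduler=scheduler)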