How to concat laserembeddings with huggingface funnel transformers simple CLS output in pytorch sequence classification task?

I was working on an NLP sequence classification problem (3 classes) using Hugging Face transformers (funnel-transformer/large) and TensorFlow.

First, I created the LASER embeddings like this:

from laserembeddings import Laser
# Build the LASER encoder, embed every row of the 'text' column as English,
# and persist the resulting embeddings to 'train.pkl'.
laser = Laser()
df = pd.read_csv("mycsv.csv")
sentences = df['text'].values
embeds = laser.embed_sentences(sentences, lang='en')
write_pickle_to_file('train.pkl', embeds)

part 1 : Tensorflow version

For data preparation I use code like the following:


def encode_text(texts):
    """Tokenize a batch of texts and return the encodings as int64 arrays.

    Args:
        texts: list of strings to tokenize.

    Returns:
        A list ``[input_ids, attention_mask, token_type_ids]`` of
        ``np.int64`` arrays, in that order.
    """
    # NOTE(review): the arguments of this call were lost in the original
    # snippet (the parenthesis was never closed). The settings below are a
    # reconstruction: fixed-length padding to the module-level `max_len` is
    # assumed because the Keras inputs are declared with shape (max_len,) and
    # np.asarray needs rectangular data -- confirm against the real code.
    enc_di = tokenizer.batch_encode_plus(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_attention_mask=True,
        return_token_type_ids=True,
    )
    return [np.asarray(enc_di['input_ids'], dtype=np.int64),
            np.asarray(enc_di['attention_mask'], dtype=np.int64),
            np.asarray(enc_di['token_type_ids'], dtype=np.int64)]

Then, inside the training function:

# Tokenize the training texts; x_train is [input_ids, attention_mask, token_type_ids].
x_train = encode_text(df.text.to_list())

# NOTE(review): the original snippet had unbalanced brackets -- the dict braces
# and the dataset constructor were lost. `tf.data.Dataset.from_tensor_slices`
# over a (features, labels) tuple is a reconstruction; any shuffle/batch/
# prefetch steps that followed are unknown and must be re-added as needed.
# NOTE(review): `train` is not defined in this snippet; presumably it is the
# DataFrame that holds the LASER columns -- confirm.
train_ds = tf.data.Dataset.from_tensor_slices((
    {
        # Keys match the names of the Keras Input layers in create_model().
        "input_ids":      x_train[0],
        "input_masks":    x_train[1],
        "input_segments": x_train[2],
        # laser_columns contains all the laser embedded columns
        "lasers":         np.array(train[laser_columns].values, dtype=np.float32),
    },
    tf.one_hot(df["label"].to_list(), 3),  # 3 classes
))

I add the LASER embeddings to my model like this:

def create_model():
    """Build and compile the Keras classifier that fuses the transformer CLS
    vector with the LASER sentence embedding.

    Returns:
        A compiled Keras ``Model`` with inputs named ``input_ids``,
        ``input_masks``, ``input_segments`` and ``lasers`` and a 3-way
        softmax output.
    """
    transformer = transformers.TFAutoModel.from_pretrained(
        cfg.pretrained, config=config, from_pt=True
    )

    # Transformer branch: token ids, attention mask and segment ids.
    input_ids = Input(shape=(max_len,), dtype="int32", name="input_ids")
    input_masks = Input(shape=(max_len,), dtype="int32", name="input_masks")
    input_segments = Input(shape=(max_len,), dtype="int32", name="input_segments")
    sequence_output = transformer(
        input_ids, attention_mask=input_masks, token_type_ids=input_segments
    )[0]

    # Use the hidden state at the first sequence position as the sentence vector.
    cls_token = sequence_output[:, 0, :]

    # LASER branch: project the precomputed embedding through a tanh layer.
    lasers = Input(shape=(n_lasers,), dtype=tf.float32, name="lasers")  # n_lasers = 1024
    lasers_output = tf.keras.layers.Dense(n_lasers, activation='tanh')(lasers)

    # Fuse both representations and classify.
    x = tf.keras.layers.Concatenate()([cls_token, lasers_output])

    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Dense(2048, activation='tanh')(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    out = tf.keras.layers.Dense(3, activation='softmax')(x)
    model = Model(inputs=[input_ids, input_masks, input_segments, lasers], outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    model.compile(
        Adam(learning_rate=1e-5),
        loss=losses.CategoricalCrossentropy(),
        metrics=["acc", metrics.CategoricalCrossentropy(name='xentropy')],
    )
    return model

Now my question is: how do we do the same with PyTorch, for the exact same problem and the same dataset?

part 2 : pytorch version

# Load mytrain.csv into a DataFrame.
df = pd.read_csv("mytrain.csv")
class myDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item as a text pair.

    Columns ``column1``, ``column2`` and ``column3`` are joined with single
    spaces to form the first segment; ``column4`` is the second segment.

    Args:
        df: DataFrame with ``column1``..``column4`` (and ``label`` when
            ``training`` is true).
        max_length: maximum token length passed to the tokenizer.
        tokenizer: object exposing ``encode_plus``.
        training: when true, each sample also carries its ``target``.
    """

    def __init__(self, df, max_length, tokenizer, training=True):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.column1 = self.df['column1'].values
        self.column2 = self.df['column2'].values
        self.column3 = self.df['column3'].values
        self.column4 = self.df['column4'].values
        # NOTE(review): the original line here was garbled
        # (`...values = training`); storing the flag and guarding the label
        # lookup on it is the reconstructed intent.
        self.training = training
        if self.training:
            self.targets = self.df['label'].values

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return a dict of un-padded features.

        Returns:
            Dict with ``input_ids`` and ``attention_mask``, plus
            ``token_type_ids`` when the tokenizer produced them and
            ``target`` when the dataset was built with ``training=True``.
        """
        column1 = self.column1[index]
        column2 = self.column2[index]
        column3 = self.column3[index]
        text0 = self.column4[index]
        text1 = column1 + ' ' + column2 + ' ' + column3

        inputs = self.tokenizer.encode_plus(
            text1,
            text0,
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=True,
            max_length=self.max_len,
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']
        # Targets only exist in training mode (see __init__).
        if self.training:
            samples['target'] = self.targets[index]
        return samples
# Collator built with the tokenizer stored in CONFIG; presumably handed to the
# DataLoader so each batch is padded to a common length -- confirm at the call site.
collate_fn = DataCollatorWithPadding(tokenizer=CONFIG['tokenizer'])

class myModel(nn.Module):
    """Transformer encoder followed by a linear 3-class head on the first token.

    Args:
        model_name: name or path passed to ``AutoModel.from_pretrained`` and
            ``AutoConfig.from_pretrained``.
    """

    def __init__(self, model_name):
        super(myModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        # NOTE(review): the condition and the enabling call around this print
        # were lost in the original snippet. The CONFIG key name below is an
        # assumption -- confirm against the real configuration.
        if CONFIG.get('gradient_checkpointing', False):
            self.model.gradient_checkpointing_enable()
            print("using gradient_checkpoint...")
        self.config = AutoConfig.from_pretrained(model_name)
        # NOTE(review): the call wrapping these entries was lost; `update` is
        # the reconstructed intent. This config is created after the model and
        # is never passed to it, so here it is only read for `hidden_size`.
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.0,
                "layer_norm_eps": 1e-7,
                "add_pooling_layer": False,
            }
        )
        self.fc = nn.Linear(self.config.hidden_size, 3)

    def forward(self, ids, mask):
        """Return the 3-class logits for a batch.

        Args:
            ids: token ids forwarded as ``input_ids``.
            mask: attention mask forwarded as ``attention_mask``.
        """
        out = self.model(input_ids=ids, attention_mask=mask, output_hidden_states=False)
        # First output of the encoder, sliced at sequence position 0.
        out = out[0][:, 0, :]
        outputs = self.fc(out)
        return outputs

In the training and validation loops I have code like this:

# Iterate over the batches with a progress bar sized to the dataloader.
bar = tqdm(enumerate(dataloader), total=len(dataloader))
for step, data in bar:
    # Move the three batch tensors to the target device as int64.
    ids, mask, targets = (
        data[key].to(device, dtype=torch.long)
        for key in ('input_ids', 'attention_mask', 'target')
    )
    batch_size = ids.shape[0]
    # forward pass with `autocast` context manager
    with autocast(enabled=True):
        outputs = model(ids, mask)
        loss = loss_fct(outputs, targets)

I would like to know where and how, in my Hugging Face PyTorch pipeline, I can use the LASER embeddings that I created earlier and used in the TensorFlow Hugging Face model.
I would like to concatenate the LASER embeddings with the Funnel Transformer's simple CLS token output and train the transformer model with the LASER embedding as an extra feature in the PyTorch implementation, exactly as I did in the TensorFlow example. Do you know how to modify my PyTorch code to make this work? The TensorFlow implementation with the concatenated LASER embeddings posted above works well; I just want to do the same in PyTorch. Your help is highly appreciated. Thanks in advance.