How to concatenate LASER embeddings with a Hugging Face Funnel Transformer's simple CLS output in a PyTorch sequence classification task?

mobassir94 · August 4, 2022, 8:47pm

I am approaching an NLP sequence classification problem (3 classes) using Hugging Face Transformers (funnel-transformer/large) and TensorFlow.

First, I created the LASER embeddings like this:

from laserembeddings import Laser
# Build the LASER encoder and load the table whose `text` column is embedded.
laser = Laser()
df = pd.read_csv("mycsv.csv")

# Embed the values of the `text` column, declaring the language as English.
embeds = laser.embed_sentences(
    df['text'].values,
    lang='en',
)

# Hand the embeddings to the project's pickle helper under 'train.pkl'
# (presumably it serializes them to that file -- confirm against the helper).
write_pickle_to_file('train.pkl', embeds)

Part 1: TensorFlow version

For data preparation I use code like the following:


# Join the three source columns into one string per row, placing the
# tokenizer's separator token between consecutive columns.
df['text'] = (
    temp['column1']
    + tokenizer.sep_token
    + temp['column2']
    + tokenizer.sep_token
    + temp['column3']
)

def encode_text(texts):
    """Tokenize a batch of texts into fixed-length model inputs.

    Every sequence is truncated and padded to ``cfg.max_len``.

    Args:
        texts: Batch of raw strings handed to ``tokenizer.batch_encode_plus``.

    Returns:
        A three-element list ``[input_ids, attention_mask, token_type_ids]``,
        each converted to an ``np.int64`` array.
    """
    # `padding='max_length'` on its own requests fixed-length padding; the
    # deprecated `pad_to_max_length=True` flag is redundant next to it and is
    # intentionally not passed.
    enc_di = tokenizer.batch_encode_plus(
        texts,
        padding='max_length',
        truncation=True,
        return_token_type_ids=True,
        max_length=cfg.max_len,
    )

    return [
        np.asarray(enc_di[key], dtype=np.int64)
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    ]

Then, inside the training function:


# Tokenize the training texts; encode_text returns
# [input_ids, attention_mask, token_type_ids].
x_train = encode_text(df.text.to_list())

# Feature dict: one entry per named model input.
features = {
    "input_ids": x_train[0],
    "input_masks": x_train[1],
    "input_segments": x_train[2],
    # laser_columns contains all the laser embedded columns
    # NOTE(review): these rows are read from `train` while the labels below
    # are read from `df` -- confirm both frames are row-aligned.
    "lasers": np.array(train[laser_columns].values, dtype=np.float32),
}

# One-hot targets for the 3 class problem.
labels = tf.one_hot(df["label"].to_list(), 3)

# Endless, shuffled, batched and prefetched stream of (features, labels).
train_ds = tf.data.Dataset.from_tensor_slices((features, labels))
train_ds = train_ds.repeat().shuffle(2048)
train_ds = train_ds.batch(cfg.batch_size).prefetch(AUTO)

I add the LASER embeddings to my model like this:


def create_model():
    """Build and compile the Keras classifier used for the 3-class task.

    The first-position vector of the transformer's sequence output is
    concatenated with a tanh-projected copy of the ``lasers`` input, then
    passed through dropout, a 2048-unit tanh layer, dropout again, and a
    3-way softmax.

    Returns:
        A compiled ``Model`` with inputs ``input_ids``, ``input_masks``,
        ``input_segments`` and ``lasers``.
    """
    transformer = transformers.TFAutoModel.from_pretrained(
        cfg.pretrained, config=config, from_pt=True
    )
    # Take the sequence length from the shared config (the same value the
    # tokenizer pads to) instead of a separate hard-coded 512, so the two
    # cannot drift apart.
    max_len = cfg.max_len

    # transformer
    input_ids = Input(shape=(max_len,), dtype="int32", name="input_ids")
    input_masks = Input(shape=(max_len,), dtype="int32", name="input_masks")
    input_segments = Input(shape=(max_len,), dtype="int32", name="input_segments")

    sequence_output = transformer(
        input_ids, attention_mask=input_masks, token_type_ids=input_segments
    )[0]

    # The vector at sequence position 0 serves as the sentence representation.
    cls_token = sequence_output[:, 0, :]

    # lasers
    lasers = Input(shape=(n_lasers,), dtype=tf.float32, name="lasers")  # n_lasers = 1024
    lasers_output = tf.keras.layers.Dense(n_lasers, activation='tanh')(lasers)

    x = tf.keras.layers.Concatenate()([cls_token, lasers_output])

    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Dense(2048, activation='tanh')(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    out = tf.keras.layers.Dense(3, activation='softmax')(x)

    model = Model(inputs=[input_ids, input_masks, input_segments, lasers], outputs=out)
    model.compile(
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        Adam(learning_rate=1e-5),
        loss=losses.CategoricalCrossentropy(),
        metrics=["acc", metrics.CategoricalCrossentropy(name='xentropy')],
    )

    return model

Now my question is: how do we do the same in PyTorch for exactly the same problem and the same dataset?

Part 2: PyTorch version


# Read mytrain.csv into the DataFrame that backs the dataset below.
df = pd.read_csv("mytrain.csv")
class myDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item.

    ``column1``, ``column2`` and ``column3`` are joined with single spaces to
    form the first text, and ``column4`` is the second text passed to the
    tokenizer. When ``training`` is true, the row's ``label`` value is
    returned under the ``target`` key.
    """

    def __init__(self, df, max_length, tokenizer, training=True):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer

        # Keep each text column as a plain array for direct indexing.
        for name in ('column1', 'column2', 'column3', 'column4'):
            setattr(self, name, df[name].values)

        self.training = training
        # Labels are only read (and only required) in training mode.
        if training:
            self.targets = df['label'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # First text: the three columns separated by single spaces.
        first_text = ' '.join(
            (self.column1[index], self.column2[index], self.column3[index])
        )
        second_text = self.column4[index]

        # No padding is requested here; sequences are only truncated.
        encoded = self.tokenizer.encode_plus(
            first_text,
            second_text,
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=True,
            is_split_into_words=False,
            max_length=self.max_len,
        )

        sample = {key: encoded[key] for key in ('input_ids', 'attention_mask')}

        # Optional entries are added only when available / applicable.
        if 'token_type_ids' in encoded:
            sample['token_type_ids'] = encoded['token_type_ids']
        if self.training:
            sample['target'] = self.targets[index]

        return sample
# Batch collator built from the tokenizer stored in CONFIG. The dataset above
# returns unpadded sequences, so this is presumably where each batch gets
# padded to a common length -- confirm against DataCollatorWithPadding's docs.
collate_fn = DataCollatorWithPadding(tokenizer=CONFIG['tokenizer'])

class myModel(nn.Module):
    """Transformer backbone with a linear classification head.

    The hidden vector at sequence position 0 of the backbone's first output
    is fed to a single ``nn.Linear`` layer that produces the class logits.

    Args:
        model_name: Checkpoint name or path given to ``AutoConfig`` and
            ``AutoModel``.
        num_classes: Number of output logits. Defaults to 3.
        gradient_checkpointing: Whether to enable gradient checkpointing on
            the backbone. Defaults to True.
    """

    def __init__(self, model_name, num_classes=3, gradient_checkpointing=True):
        super().__init__()

        # Build and adjust the config *before* instantiating the backbone and
        # pass it in explicitly; updating a separately loaded config after the
        # model exists has no effect on the model.
        self.config = AutoConfig.from_pretrained(model_name)
        # NOTE(review): these keys follow BERT-style config names; confirm
        # they match the attribute names of the chosen architecture,
        # otherwise the corresponding overrides are inert.
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.0,
                "layer_norm_eps": 1e-7,
                "add_pooling_layer": False,
                "attention_probs_dropout_prob": 0.0,
            }
        )
        self.model = AutoModel.from_pretrained(model_name, config=self.config)

        if gradient_checkpointing:
            print("using gradient_checkpoint...")
            self.model.gradient_checkpointing_enable()

        self.fc = nn.Linear(self.config.hidden_size, num_classes)

    def forward(self, ids, mask):
        """Return class logits for a batch.

        Args:
            ids: Token ids passed to the backbone as ``input_ids``.
            mask: Attention mask passed to the backbone as ``attention_mask``.

        Returns:
            The output of the linear head applied to the position-0 vector.
        """
        # Hidden states are explicitly not requested for the forward pass,
        # regardless of the config default.
        out = self.model(input_ids=ids, attention_mask=mask, output_hidden_states=False)
        cls_vector = out[0][:, 0, :]
        return self.fc(cls_vector)

And in the train and validation loops I have code like this:


# Iterate over the batches with a progress bar; `step` is the batch index.
# NOTE(review): only the forward pass and loss are shown in this excerpt.
bar = tqdm(enumerate(dataloader), total=len(dataloader))
for step, data in bar:
        # Move the batch tensors to the target device as torch.long.
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)
        
        # Size of the first dimension of `ids` (number of rows in this batch).
        batch_size = ids.size(0)
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # forward pass with `autocast` context manager
        with autocast(enabled=True):
            outputs = model(ids, mask)
            loss = loss_fct(outputs, targets)

I would like to know where and how in my Hugging Face PyTorch pipeline I can use the LASER embeddings that I created earlier and used in the TensorFlow Hugging Face model.
I would like to concatenate the LASER embeddings with the Funnel Transformer's simple CLS token output and train the transformer model with the LASER embeddings as an extra feature in the PyTorch implementation, exactly as I did in the TensorFlow example. Do you know how to modify my PyTorch code to make this work? The TensorFlow implementation with the concatenated LASER embeddings that I posted above works well; I just want to do the same in the PyTorch implementation. Your help is highly appreciated. Thanks in advance.