I am training a classification model using a pre-trained DistilBERT model, for which I tried two methods.
In Method 1, I dumped the embeddings using this snippet:
def get_encodings(model, dataloader):
    model.eval()
    model.to(device)
    embeddings_df = pd.DataFrame()
    with torch.no_grad():
        for _, data in enumerate(dataloader, 0):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            last_hidden_states = model(ids, mask)[0]          # (batch, seq_len, 768)
            embeddings = last_hidden_states[:, 0, :].cpu().numpy()  # [CLS] token embedding
            temp_df = pd.DataFrame(embeddings)
            embeddings_df = embeddings_df.append(temp_df, ignore_index=True)
    return embeddings_df
Then I trained an MLP on these embeddings and got 72% accuracy on the test set.
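For reference, the MLP head in Method 1 uses the same layer sizes as the head of the class in Method 2 below (768 → 256 → NUM_CLASSES with BatchNorm and Dropout). This is only a minimal sketch of it; the optimizer, learning rate, EPOCHS, and the labels variable are illustrative placeholders, not my exact settings:

import torch
import torch.nn as nn

# MLP trained on the dumped [CLS] embeddings (mirrors the Method 2 head)
mlp = nn.Sequential(
    nn.Linear(768, 256),
    nn.ReLU(),
    nn.BatchNorm1d(256),
    nn.Dropout(0.5),
    nn.Linear(256, NUM_CLASSES),
)

optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-3)  # illustrative learning rate
criterion = nn.CrossEntropyLoss()

X = torch.tensor(embeddings_df.values, dtype=torch.float32)  # from get_encodings above
y = torch.tensor(labels, dtype=torch.long)                    # integer class labels (assumed)

mlp.train()
for epoch in range(EPOCHS):
    optimizer.zero_grad()
    loss = criterion(mlp(X), y)
    loss.backward()
    optimizer.step()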
But in Method 2 I am using the model directly inside the class. With the same layers and hyperparameters I am getting very poor results. Here is my model class:
import torch
import torch.nn as nn
from torch.nn import Linear, ReLU
from torch.nn.init import kaiming_uniform_
from transformers import DistilBertModel

class DistillBERTClass(torch.nn.Module):
    # define model elements
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased", cache_dir=cache_path)
        # input to first hidden layer
        self.hidden1 = Linear(768, 256)
        kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
        self.activation = ReLU()
        self.bn1 = nn.BatchNorm1d(num_features=256)
        self.dropout_1 = torch.nn.Dropout(0.5)
        # output
        self.output = Linear(256, NUM_CLASSES)

    # forward propagate input
    def forward(self, input_ids, attention_mask):
        # extract embeddings from the transformer
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]       # last hidden state: (batch, seq_len, 768)
        pooler = hidden_state[:, 0, :]   # [CLS] token representation
        # input to first hidden layer
        X = self.hidden1(pooler)
        X = self.bn1(self.activation(X))
        X = self.dropout_1(X)
        # output layer
        X = self.output(X)
        return X
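For completeness, this is roughly how I train the class above. The loss function, optimizer, learning rate, and the 'targets' key of the dataloader are illustrative assumptions, not my exact values:

model = DistillBERTClass().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

model.train()
for _, data in enumerate(train_dataloader, 0):
    ids = data['ids'].to(device, dtype=torch.long)
    mask = data['mask'].to(device, dtype=torch.long)
    targets = data['targets'].to(device, dtype=torch.long)

    optimizer.zero_grad()
    outputs = model(ids, mask)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()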
Also, I tried freezing the first 100 layers (parameter tensors), but that further degraded the performance. The network got stuck at 20-30% accuracy. Here is my snippet for layer freezing:
# freeze transformer layers
freezed_layers = 0
for name, param in list(model.named_parameters())[:100]:
    print(name)
    param.requires_grad = False
    freezed_layers += 1
Am I missing something? Do I need to make any modification to the optimizer after freezing the layers?
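For example, I am wondering whether, after freezing, I need to rebuild the optimizer so it only sees the still-trainable parameters, roughly like this (just a sketch of what I suspect might be needed; whether this is actually required is exactly my question):

# pass only the parameters that still require gradients to the optimizer
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=LEARNING_RATE,
)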
The bottom line is that I want to use the pre-trained model as a feature extractor and only train the layers after it. Backpropagation shouldn't go through the transformer layers, and their weights should remain intact, so that Method 2 gives the same performance as Method 1.
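In other words, what I am aiming for is roughly the following, using the DistillBERTClass above (the optimizer settings are illustrative): freeze only the transformer submodule self.l1 and train just the head.

model = DistillBERTClass().to(device)

# freeze every parameter of the transformer so no gradients reach it
for param in model.l1.parameters():
    param.requires_grad = False

# only the head (hidden1, bn1, output) should be updated
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=LEARNING_RATE,
)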