Hi there, first time posting here, great place to learn. I’m trying to train a model for a Kaggle competition (this one: https://www.kaggle.com/c/google-quest-challenge) using the pre-trained BERT model from Hugging Face (https://huggingface.co/transformers/quickstart.html), but after seeing a few batches the model starts predicting the exact same tensor for every sample in the validation data. Here’s the model:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel

class nlp(nn.Module):
    def __init__(self, model, output_size):
        super(nlp, self).__init__()
        self.model = model              # pre-trained BERT backbone
        self.output_size = output_size  # 30 target columns in this competition
        self.fc1 = torch.nn.Linear(1, output_size)

    def forward(self, ids, segments):
        x = self.model(ids, token_type_ids=segments)[0]  # last hidden states: (batch, seq_len, 768)
        x = F.dropout(x, 0.2)
        x = F.adaptive_avg_pool2d(x, (1, 1))             # pooled down to (batch, 1, 1)
        x = self.fc1(x)                                  # (batch, 1, 30)
        return x

model = BertModel.from_pretrained('bert-base-uncased').to('cuda')
nlp = nlp(model, 30)
nlp.cuda()
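For context, here is my understanding of how the shapes flow through the forward pass (just a quick sketch with dummy inputs; the sequence length of 512 is only an example, not what my real data uses):

# quick shape check with dummy inputs (seq_len=512 is just an example)
dummy_ids = torch.zeros(3, 512, dtype=torch.long).to('cuda')
dummy_segments = torch.zeros(3, 512, dtype=torch.long).to('cuda')

with torch.no_grad():
    hidden = model(dummy_ids, token_type_ids=dummy_segments)[0]  # (3, 512, 768)
    pooled = F.adaptive_avg_pool2d(hidden, (1, 1))               # (3, 1, 1): averaged over seq_len and hidden dims
    out = nlp.fc1(pooled)                                        # (3, 1, 30)
print(hidden.shape, pooled.shape, out.shape)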
And here are the forward outputs for 3 different samples before training:
tensor([[[ 0.4075, 0.7188, 0.4816, -0.9949, -0.8120, -0.5959, -0.4751,
0.9908, 0.1970, -0.0944, -0.7085, 0.9645, 0.5366, 0.9451,
0.8958, 0.2544, 0.8900, 0.6315, -0.3778, 0.8029, 0.7653,
0.2894, 0.0876, 0.4636, 0.9169, -0.3048, 0.5296, 0.5875,
-0.0777, 0.6130]],
[[ 0.4103, 0.7234, 0.4815, -0.9936, -0.8165, -0.5917, -0.4724,
0.9868, 0.1983, -0.0975, -0.7086, 0.9650, 0.5369, 0.9489,
0.8963, 0.2523, 0.8886, 0.6311, -0.3805, 0.8062, 0.7614,
0.2870, 0.0904, 0.4616, 0.9169, -0.3004, 0.5298, 0.5903,
-0.0808, 0.6147]],
[[ 0.4072, 0.7183, 0.4816, -0.9950, -0.8115, -0.5963, -0.4754,
0.9912, 0.1968, -0.0941, -0.7085, 0.9644, 0.5366, 0.9447,
0.8957, 0.2546, 0.8902, 0.6315, -0.3775, 0.8026, 0.7657,
0.2897, 0.0873, 0.4638, 0.9169, -0.3053, 0.5295, 0.5872,
-0.0773, 0.6129]]], device='cuda:0', grad_fn=<AddBackward0>)
After that I run the training process:
from datetime import datetime
from torch.utils.data import DataLoader
import torch.optim as optim

optimizer = optim.AdamW(nlp.parameters(), lr=0.001)
epochs = 1
data = DataLoader(segments.index, shuffle=True, batch_size=8, num_workers=4)
criterion = nn.BCEWithLogitsLoss()
to_plot_loss = []

print('Train started at ' + datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
for epoch in range(epochs):
    for i, index in enumerate(data):
        index = index.tolist()
        optimizer.zero_grad()
        # get segments
        segments_tensor = torch.tensor(segments[index].tolist()).to('cuda')
        # get ids
        ids_tensor = torch.tensor(ids[index].tolist()).to('cuda')
        # targets
        target = torch.tensor(y_train.loc[index].values, dtype=torch.float32).to('cuda')
        output = nlp(ids_tensor, segments_tensor)
        loss = criterion(output.view(-1, 30), target)
        loss.backward()
        optimizer.step()
        if i % 50 == 0:
            to_plot_loss.append(loss.item())
print('Train finished at ' + datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
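I plot the logged losses afterwards with something like:

import matplotlib.pyplot as plt

# loss values were appended every 50 batches
plt.plot(to_plot_loss)
plt.xlabel('logging step (every 50 batches)')
plt.ylabel('BCEWithLogitsLoss')
plt.show()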
After that, I get these outputs for the same samples:
tensor([[[ 1.2449, 0.1742, -1.1526, 0.2148, 1.5262, -0.0699, 0.2180,
0.4770, -1.2257, -0.9791, -0.2025, -0.6288, -1.2206, -1.7942,
-0.9674, -0.8788, 0.1244, -0.8601, -0.2772, -1.7893, 1.6495,
2.3030, 0.6057, 2.4578, 2.4420, 0.2222, -0.1431, -1.6405,
0.2265, 1.6829]],
[[ 1.2449, 0.1742, -1.1526, 0.2148, 1.5262, -0.0699, 0.2180,
0.4770, -1.2257, -0.9791, -0.2025, -0.6288, -1.2206, -1.7942,
-0.9674, -0.8788, 0.1244, -0.8601, -0.2772, -1.7893, 1.6495,
2.3030, 0.6057, 2.4578, 2.4420, 0.2222, -0.1431, -1.6405,
0.2265, 1.6829]],
[[ 1.2449, 0.1742, -1.1526, 0.2148, 1.5262, -0.0699, 0.2180,
0.4770, -1.2257, -0.9791, -0.2025, -0.6288, -1.2206, -1.7942,
-0.9674, -0.8788, 0.1244, -0.8601, -0.2772, -1.7893, 1.6495,
2.3030, 0.6057, 2.4578, 2.4420, 0.2222, -0.1431, -1.6405,
0.2265, 1.6829]]], device='cuda:0', grad_fn=<AddBackward0>)
I’ve tried a lot of things: changing the learning rate, the loss, the optimizer, adding a sigmoid layer, but I always end up with the same output for every sample. For reference, the sigmoid variant looked roughly like this (reconstructed from memory, so the details may differ):
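def forward(self, ids, segments):
    x = self.model(ids, token_type_ids=segments)[0]
    x = F.dropout(x, 0.2)
    x = F.adaptive_avg_pool2d(x, (1, 1))
    x = torch.sigmoid(self.fc1(x))  # squash outputs into [0, 1] to match the target range
    return x

# with the loss changed to match, since BCEWithLogitsLoss applies its own sigmoid:
criterion = nn.BCELoss()

Any insight into why this is happening? Any help is welcome. Thank you.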