I’m trying to fine-tune a DistilBERT classifier, but the gradients appear to be either not computed or not stored: when I print them inside the training loop they are always zero, and as a result the weights never update. Here is my training setup and loop:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import DistilBertModel

optimizer = optim.Adam(network.parameters(), lr=0.0001)
loss_fn = nn.CrossEntropyLoss().to(device)

train_loader = DataLoader(
    small_train_dataset,
    batch_size=16,
    shuffle=True,
    # num_workers=4
)
valid_loader = DataLoader(
    small_val_dataset,
    batch_size=16,
    shuffle=True,
    # num_workers=4
)

train_losses = []
val_losses = []

network.train()
for i, batch in enumerate(train_loader):  # Get batch
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    optimizer.zero_grad()
    outputs = network(input_ids, attention_mask=attention_mask)  # Forward pass
    loss = loss_fn(outputs, labels)                              # Calculate loss
    loss.backward()                                              # Calculate gradients

    # Print the gradients on the first and seventh batch
    for name, param in network.named_parameters():
        if i == 0 or i == 6:
            if param.grad is not None:
                print('GRADIENTS')
                print(param.grad.data)
                print('')

    optimizer.step()  # Update weights

    # Print one specific weight tensor on the first and seventh batch
    for name, param in network.named_parameters():
        if i == 0 or i == 6:
            if 'distilbert.transformer.layer.5.ffn.lin2.weight' in name:
                print('WEIGHTS')
                print(f'BATCH - {i} - TRAIN -- {name}: {param}')
                print('')
            else:
                continue

    train_losses.append(loss.detach().cpu().numpy())
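For what it’s worth, here is a more compact check that could be dropped in right after loss.backward() to summarise the gradient magnitude of every parameter instead of printing the full tensors. This is just a sketch against the same network as above:

# Sketch: one scalar per parameter, so all-zero gradients are easy to spot.
for name, param in network.named_parameters():
    if param.grad is not None:
        print(f'{name}: grad abs-sum = {param.grad.abs().sum().item():.3e}')
    else:
        print(f'{name}: grad is None')

If every abs-sum comes out as 0.0 here too, the gradients really are zero and it is not just the full-tensor printout being truncated.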
Here’s my network:
class Classifier(nn.Module):
    def __init__(self, n_classes):
        super(Classifier, self).__init__()
        self.distilbert = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.pre_classifier = nn.Linear(self.distilbert.config.dim, self.distilbert.config.dim)
        self.classifier = nn.Linear(self.distilbert.config.dim, n_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask):
        distilbert_output = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden_state = distilbert_output[0]                 # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]                  # (bs, dim) -- [CLS] token
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = nn.ReLU()(pooled_output)            # (bs, dim)
        pooled_output = self.dropout(pooled_output)         # (bs, dim)
        logits = self.classifier(pooled_output)             # (bs, n_classes)
        s_max = F.softmax(logits, dim=1)
        return s_max
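For completeness, the surrounding setup looks roughly like this; the checkpoint name and class count below are placeholders rather than necessarily my exact values:

# Sketch of how the model above is instantiated (values here are illustrative).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'  # placeholder checkpoint name
network = Classifier(n_classes=2).to(device)        # placeholder class count

small_train_dataset and small_val_dataset are tokenised datasets whose items provide the input_ids, attention_mask and labels tensors used in the training loop.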