Gradients are not being updated or stored

So, I’m trying to train a DistilBERT model, but the gradients don’t seem to be calculated or stored: when I print them inside the training loop they are always zero, and consequently the weights are never updated.

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

optimizer = optim.Adam(network.parameters(), lr=0.0001)
loss_fn = nn.CrossEntropyLoss().to(device)

train_loader = DataLoader(
    small_train_dataset,
    batch_size=16,
    shuffle=True,
    # num_workers=4,
)

valid_loader = DataLoader(
    small_val_dataset,
    batch_size=16,
    shuffle=True,
    # num_workers=4,
)

train_losses = []
val_losses = []

network.train()
for i, batch in enumerate(train_loader): # Get Batch
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    optimizer.zero_grad()
    outputs = network(input_ids, attention_mask=attention_mask) # Pass Batch

    loss = loss_fn(outputs, labels) # Calculate Loss
    loss.backward() # Calculate Gradients
    # Inspect gradients for selected batches after backward()
    if i == 0 or i == 6:
        for name, param in network.named_parameters():
            if param.grad is not None:
                print('GRADIENTS')
                print(param.grad.data)
                print('')
    optimizer.step() # Update Weights
    # Inspect one specific weight tensor for the same batches after the update
    if i == 0 or i == 6:
        for name, param in network.named_parameters():
            if 'distilbert.transformer.layer.5.ffn.lin2.weight' in name:
                print('WEIGHTS')
                print(f'BATCH - {i} - TRAIN-- {name}: {param}')
                print('')
    train_losses.append(loss.detach().cpu().numpy())
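
A more compact way to spot-check whether gradients are flowing (just a sketch, reusing the layer name from the weight print above) is to look at the gradient norm of a single parameter right after loss.backward():

# Sketch: print one layer's gradient norm instead of dumping every tensor
if i == 0 or i == 6:
    g = dict(network.named_parameters())['distilbert.transformer.layer.5.ffn.lin2.weight'].grad
    print(f'BATCH {i} grad norm: {g.norm().item():.6f}' if g is not None else f'BATCH {i} grad is None')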

Here’s my network:

import torch.nn.functional as F
from transformers import DistilBertModel

class Classifier(nn.Module):

    def __init__(self, n_classes):
        super(Classifier, self).__init__()

        self.distilbert = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.pre_classifier = nn.Linear(self.distilbert.config.dim, self.distilbert.config.dim)
        self.classifier = nn.Linear(self.distilbert.config.dim, n_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask):
        distilbert_output = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        logits = self.classifier(pooled_output)  # (bs, num_labels)

        s_max = F.softmax(logits, dim=1)
        return s_max

nn.CrossEntropyLoss expects raw logits as the model output, so remove the F.softmax call in your forward method and check if this yields non-zero gradients.
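
For reference, this is what the forward method would look like with only the softmax removed (everything else unchanged); if you need probabilities at inference time, you can still apply F.softmax(logits, dim=1) outside the model, after the loss has been computed:

    def forward(self, input_ids, attention_mask):
        distilbert_output = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden_state = distilbert_output[0]       # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]        # (bs, dim), first ([CLS]) token
        pooled_output = self.pre_classifier(pooled_output)
        pooled_output = nn.ReLU()(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)   # (bs, n_classes)
        return logits  # raw logits; nn.CrossEntropyLoss applies log-softmax internally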


Thank you!! It did yield non-zero gradients.