I get the following error:
` RuntimeError: It looks like your LightningModule has parameters that were not used in producing the loss returned by training_step. If this is intentional, you must enable the detection of unused parameters in DDP, either by setting the string value `strategy='ddp_find_unused_parameters_true'` or by setting the flag in the strategy with `strategy=DDPStrategy(find_unused_parameters=True)`.`
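Setting the strategy does make my model run; roughly, the change to my Trainer looks like this (a sketch, the accelerator/devices values are just placeholders for my actual multi-GPU setup):

import lightning as pl
from lightning.pytorch.strategies import DDPStrategy

# accelerator/devices are placeholders; only the strategy argument matters here
trainer = pl.Trainer(accelerator="gpu", devices=2,
                     strategy=DDPStrategy(find_unused_parameters=True))
# equivalently: strategy="ddp_find_unused_parameters_true"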
However, I was trying to understand why this cropped up in the first place. Based on my research, I tried to find which parameters were 'unused' in my forward pass, but everything came up blank:
for name, param in model.named_parameters():
    if param.grad is None:
        print(name)
Nothing was returned, so I am at a loss as to why this is occurring. Here is my model (you can see where I commented things out while troubleshooting; yes, the dimensions are odd, that is on purpose):
import lightning as pl
import segmentation_models_pytorch as smp
from monai.losses import DiceCELoss
from torchmetrics.segmentation import DiceScore
import torch
from torch.optim import lr_scheduler
from torch import nn


class EZBasicModel(pl.LightningModule):
    def __init__(self, arch, encoder_name, in_channels, out_classes, **kwargs):
        super().__init__()
        self.model = smp.create_model(
            arch,
            encoder_name=encoder_name,
            in_channels=in_channels,
            classes=out_classes,
            **kwargs,
        )
        # Loss function for multi-class segmentation
        self.loss = DiceCELoss(to_onehot_y=False, softmax=True)
        #self.score = DiceScore(num_classes=2, include_background=False)

    def forward(self, image):
        # Normalize image
        #image = (image - self.mean) / self.std
        mask = self.model(image)
        return mask

    def training_step(self, batch, batch_idx):
        inputImage, mask, inputName, maskName = batch
        batch_size = inputImage.shape[0]
        pred = self.model(inputImage)
        loss = self.loss(pred, mask)
        #classes = nn.Softmax(dim=1)(pred)
        #classes = torch.round(classes)
        #score = self.score(classes, mask)
        self.log('train_dice_loss', loss, on_epoch=True, prog_bar=True, logger=True, sync_dist=True, batch_size=batch_size)
        #self.log('train_dice', score, on_epoch=True, prog_bar=True, logger=True, sync_dist=True, batch_size=batch_size)
        return loss

    def validation_step(self, batch, batch_idx):
        inputImage, mask, inputName, maskName = batch
        batch_size = inputImage.shape[0]
        pred = self.model(inputImage)
        loss = self.loss(pred, mask)
        #classes = nn.Softmax(dim=1)(pred)
        #classes = torch.round(classes)
        #score = self.score(classes, mask)
        self.log('val_dice_loss', loss, on_epoch=True, prog_bar=True, logger=True, sync_dist=True, batch_size=batch_size)
        #self.log('val_dice', score, on_epoch=True, prog_bar=True, logger=True, sync_dist=True, batch_size=batch_size)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=2e-3)
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-5)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step",
                "frequency": 1,
            },
        }
I run it with dummy data:
import numpy as np
import torch

from model import EZBasicModel

# Dummy input: (batch, channels, H, W) = (1, 496, 512, 96), matching in_channels=496
image = np.random.random((1, 496, 512, 96))
image = torch.tensor(image).type(torch.float32)

# Dummy mask: (classes, H, W) = (2, 512, 96), rounded to 0/1
mask = np.random.random((2, 512, 96))
mask = torch.round(torch.tensor(mask))

model = EZBasicModel('FPN', 'resnet34', 496, 2)
preds = model.forward(image)
loss = model.loss(preds, mask.unsqueeze(dim=0))
loss.backward()

for name, param in model.named_parameters():
    if param.grad is None:
        print(name)
    else:
        print(f"grad applied to {name}")
Does anyone have guidance on what could be happening here? I'm still new to DDP and learning.