How to check Mask R-CNN predictions during training

Hi, I have a problem with my Mask R-CNN training process. The training loss decreases nicely, but the validation metrics stay exactly the same from the first epoch to the last, and they look very bad (mostly only one class is ever predicted). I wanted to look at the predictions during training, but I don't know how to access them, because output = model(x, y) returns only the losses. Do you know how to get them?
I also have a second question: do you know why validation doesn't work? Why are the predictions identical throughout a single training run?
I should add that I attached an extra head to the backbone, and that head is what I actually want to train (it predicts the age of the people in the picture).
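For context, torchvision detection models change their return type with the mode: called in train mode with targets they return a dict of losses, while in eval mode they return a list of per-image prediction dicts. So my assumption (not verified, which is why I'm asking) is that something like this sketch would give me predictions mid-training:

    # assumption: temporarily switch to eval mode to get predictions, then switch back
    model.eval()
    with torch.no_grad():
        detections = model(x)  # list of per-image dicts (boxes/labels/scores, plus my "age" output)
    model.train()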

training function:

def train(train_loader, model, optimizer, epoch, device):
    model.train()
    loss_monitor = AverageMeter()

    lr_scheduler = None
    if epoch == 0:
        # linear warmup over the first (up to) 1000 iterations
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(train_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    with tqdm(train_loader) as _tqdm:
        for x, y in _tqdm:
            x = x.to(device)
            # move every target tensor to the device; iterate over a copy of the
            # items so the dict isn't mutated while being iterated
            for key, value in list(y.items()):
                y[key] = torch.as_tensor(value).to(device)

            # replicate the same target dict for every image in the batch
            y_list = [y for _ in range(len(x))]
            outputs = model(x, y_list)  # in train mode this returns a dict of losses
            print(outputs)
            # calc loss
            cur_loss = outputs["loss_age"]

            # record the loss (use .item() so the meter doesn't hold onto the graph)
            sample_num = x.size(0)
            loss_monitor.update(cur_loss.item(), sample_num)

            # compute gradients and take an optimizer step
            optimizer.zero_grad()
            cur_loss.backward()
            optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            _tqdm.set_postfix(
                OrderedDict(stage="train", epoch=epoch, loss=loss_monitor.avg),
            )

    return loss_monitor.avg
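(utils.warmup_lr_scheduler comes from the torchvision detection references; as far as I know it is just a LambdaLR that ramps the learning rate linearly over the first warmup_iters iterations, roughly like this:)

    # rough sketch of utils.warmup_lr_scheduler from the torchvision references
    def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
        def f(x):
            if x >= warmup_iters:
                return 1
            alpha = float(x) / warmup_iters
            return warmup_factor * (1 - alpha) + alpha
        return torch.optim.lr_scheduler.LambdaLR(optimizer, f)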

validation function:

def validate(val_loader, model, epoch, device):
    model.eval()
    preds = []
    gt = []
    print("Validation running...")
    with torch.no_grad():
        with tqdm(val_loader) as _tqdm:
            for x, y in _tqdm:
                x = x.to(device)

                for key, value in list(y.items()):
                    y[key] = torch.as_tensor(value).to(device)
                gt.append(y["age"].cpu().numpy())

                outputs = model(x)  # in eval mode this returns per-image predictions
                print(outputs)
                for output in outputs:  # I just change the format of the predictions here
                    # decode the age head: expected value over the softmax distribution
                    pred = F.softmax(output["age"], dim=-1).cpu().numpy()
                    pred = (pred * np.arange(0, pred.size)).sum(axis=-1)
                    preds.append(np.array([pred]))

                _tqdm.set_postfix(OrderedDict(stage="val", epoch=epoch))

    mae = calculate_mae(gt, preds)  # my own functions - they work well
    f1 = calculate_f1(gt, preds)
    return mae, f1
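(The expected-value decoding in that loop is just a weighted average of the bin indices under the softmax. A tiny worked example with made-up numbers:)

    import numpy as np
    pred = np.array([0.1, 0.7, 0.2])            # softmax over 3 age bins
    age = (pred * np.arange(pred.size)).sum()   # 0.1*0 + 0.7*1 + 0.2*2 = 1.1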

main loop:

model = PornRCNN.create_resnet_50()

model = model.to(device)
model.set_age_loss_fn(loss_age)

params = [p for p in model.parameters() if p.requires_grad]

optimizer = torch.optim.SGD(params, lr=0.0003,
                            momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)
num_epoch = 100
checkpoint_dir = Path("checkpoints")

for epoch in range(start_epoch, num_epoch):
    train_loss = train(train_loader, model, optimizer, epoch, device)
    mae, f1 = validate(val_loader, model, epoch, device)
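One thing I notice while writing this out: the CosineAnnealingWarmRestarts scheduler above is created but never stepped. If it were meant to advance once per epoch, I suppose the loop would look like:

    for epoch in range(start_epoch, num_epoch):
        train_loss = train(train_loader, model, optimizer, epoch, device)
        lr_scheduler.step()  # advance the cosine schedule once per epoch
        mae, f1 = validate(val_loader, model, epoch, device)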

Could anyone help me?

Might not be related, but why do you zero the gradients after calling forward?

Thank you for the fast reply.

I did it exactly as shown in https://github.com/pytorch/vision/blob/master/references/detection/engine.py (in the train_one_epoch function):

    loss_dict = model(images, targets)

    losses = sum(loss for loss in loss_dict.values())

    ...

    optimizer.zero_grad()
    losses.backward()
    optimizer.step()
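(If I apply that pattern literally to my code, I suppose I should back-propagate the sum of all returned losses rather than loss_age alone - a sketch, assuming the standard Mask R-CNN losses are also present in outputs:)

    # sum every loss in the dict, as engine.py does, instead of only loss_age
    losses = sum(loss for loss in outputs.values())
    optimizer.zero_grad()
    losses.backward()
    optimizer.step()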