Model parameters don't update during training and all .grads are None

Hey everyone, I’m trying to build a region proposal network with a small convolutional head and VGG16 as a backbone for feature extraction. I’m having an issue where the parameters are not being updated (I’m currently fine-tuning the backbone, but will freeze the extractor later), and when I check the gradients, all of them are None. I keep getting dummy predictions and the loss isn’t decreasing. I’ve pretty much scoured this site, the docs, and Google for answers, but I’m not sure what’s wrong. Any debugging steps I might have missed? Thanks!
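For context, here is roughly how I’ve been checking the gradients after a backward pass (simplified; the full module and training loop are below):

  # simplified gradient check, run right after loss.backward()
  for name, param in rpn.named_parameters():
    # I'd expect tensors here, but every .grad prints as None
    print(name, param.grad)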

class RegionProposalNetwork(nn.Module):

  def __init__(self, in_channels=512, mid_channels=512, ratios=[1, 2, 4], anchor_scales=[8, 16, 32], stride=16):
    super(RegionProposalNetwork, self).__init__()
    n_anchor = len(anchor_scales) * len(ratios)
    #print(n_anchor)
    self.ratios = ratios
    self.anchor_scales = anchor_scales
    self.extractor = self.fe_init()
    self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
    self.reg_layer = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)
    self.cls_layer = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)

    # the paper initializes these layers with zero mean and 0.01 standard deviation
    # conv sliding layer
    self.conv1.weight.data.normal_(0, 0.01)
    self.conv1.bias.data.zero_()
    # regression layer
    self.reg_layer.weight.data.normal_(0, 0.01)
    self.reg_layer.bias.data.zero_()
    # classification layer
    self.cls_layer.weight.data.normal_(0, 0.01)
    self.cls_layer.bias.data.zero_()

    # network parameters
    #print(list(self.extractor.parameters()))
    #print(list(self.conv1.parameters()))
    #print(list(self.reg_layer.parameters()))
    #print(list(self.cls_layer.parameters()))
    #self.params = list(self.conv1.parameters()) + list(self.reg_layer.parameters()) + list(self.cls_layer.parameters()) #+ list(self.extractor.parameters())

  def forward(self, image, bbox, img_size, scale=16):
    # load data onto cpu or cuda
    #tensor = image.to(device)
    #bbox = bbox.to(device)

    # anchor generation
    anchors = anchor_target_generator(ratios=self.ratios, scales=self.anchor_scales)
    ious = bbox_ious(bbox.cpu(), anchors)
    anchor_labels = assign_labels(ious, anchors, img_size)
    anchor_locations = bbox_to_relative(ious, anchors, bbox.cpu(), img_size)
    print('Anchor Generation Completed')

    # turn anchor locations and labels into tensors
    gt_rpn_loc = torch.from_numpy(anchor_locations)
    gt_rpn_score = torch.from_numpy(anchor_labels)

    print("Extracting...   ", end='')
    x = self.extractor(image)
    x = self.conv1(x)

    # prediction of object locations with respect to the anchors, plus objectness scores
    pred_anchor_locs = self.reg_layer(x)
    pred_cls_scores = self.cls_layer(x)
    #print(pred_cls_scores, pred_anchor_locs)

    # reformat predictions to match the anchor target shapes
    pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
    pred_cls_scores = pred_cls_scores.permute(0, 2, 3, 1).contiguous()
    pred_cls_scores = pred_cls_scores.view(1, -1, 2)
    rpn_loc = pred_anchor_locs[0]
    rpn_score = pred_cls_scores[0]
    #print(rpn_loc.shape)
    #print(rpn_score.shape)

    #print("loss calc")
    rpn_score = rpn_score.detach().requires_grad_(True)
    rpn_score = rpn_score.to(device)
    rpn_loc = rpn_loc.detach().requires_grad_(True)
    rpn_loc = rpn_loc.to(device)
    gt_rpn_score = gt_rpn_score.detach()#.requires_grad_(True)
    gt_rpn_score = gt_rpn_score.to(device)
    gt_rpn_loc = gt_rpn_loc.detach().requires_grad_(True)
    gt_rpn_loc = gt_rpn_loc.to(device)

    #rpn_loss = self.loss(rpn_loc, rpn_score, gt_rpn_loc, gt_rpn_score)
    return rpn_loc, rpn_score, gt_rpn_loc, gt_rpn_score

  def predict(self, image):
    nms_thresh = 0.7
    n_train_pre_nms = 12000
    n_train_post_nms = 2000
    n_test_pre_nms = 6000
    n_test_post_nms = 300
    min_size = 16
    anchors = anchor_target_generator(ratios=self.ratios, scales=self.anchor_scales)
    #tensor = tr.to_tensor(image)
    #tensor = tensor.reshape(1, 3, 800, 800)
    tensor = image.to(device)
    x = self.extractor(tensor)
    x = self.conv1(x)

    # prediction of object locations with respect to the anchors, plus objectness scores
    pred_anchor_locs = self.reg_layer(x)
    pred_cls_scores = self.cls_layer(x)
    #locs = [x for _, x in sorted(zip(pred_cls_scores, pred_anchor_locs))]
    #scores = sorted(pred_cls_scores)

    # reformat predictions to match the anchor target sizes
    pred_cls_scores = pred_cls_scores.permute(0, 2, 3, 1).contiguous()
    # pred_cls_scores = pred_cls_scores.view(1, -1, 2)  <-- for softmax classification
    objectness_score = pred_cls_scores.view(1, 50, 50, len(self.ratios) * len(self.anchor_scales), 2)[:, :, :, :, 1].contiguous().view(1, -1)

    # convert predictions back into boxes using the same formulas as above
    objectness_score_numpy = objectness_score[0].cpu().data.numpy()
    roi = relative_to_bbox(anchors, pred_anchor_locs.cpu())
    roi, score = proposal_layer(roi, objectness_score_numpy)
    return roi, score

  def fe_init(self):
    # use the first 29 layers of a pretrained VGG16 as the feature extractor
    vgg16 = torchvision.models.vgg16(pretrained=True)
    layers = list(vgg16.features)
    new_layers = []
    for i in range(29):
      layer = layers[i]
      if isinstance(layer, nn.ReLU):
        layer.inplace = True
      new_layers.append(layer)
    extractor = nn.Sequential(*new_layers)
    for param in extractor.parameters():
      param.requires_grad = True
    return extractor

def rpn_loss(rpn_loc, rpn_score, gt_rpn_loc, gt_rpn_score):
  # classification loss on the objectness scores; anchors labelled -1 are ignored
  rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_score.long(), ignore_index=-1)
  print(rpn_cls_loss)

  # extract bounding boxes from positive labels
  pos = gt_rpn_score > 0
  mask = pos.unsqueeze(1).expand_as(rpn_loc)
  mask_loc_preds = rpn_loc[mask].view(-1, 4)
  mask_loc_targets = gt_rpn_loc[mask].view(-1, 4)

  # apply the smooth L1 regression loss
  x = torch.abs(mask_loc_targets - mask_loc_preds)
  rpn_loc_loss = ((x < 1).float() * 0.5 * x**2) + ((x >= 1).float() * (x - 0.5))

  # combine the regression loss with the class loss, using a regularization parameter
  rpn_lambda = 10.
  N_reg = (gt_rpn_score > 0).float().sum()
  rpn_loc_loss = rpn_loc_loss.sum() / N_reg
  rpn_loss = rpn_cls_loss + (rpn_lambda * rpn_loc_loss)
  return rpn_loss

epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# assuming that we are on a CUDA machine, this should be a CUDA device
rpn = RegionProposalNetwork()
rpn = rpn.to(device)

params = []
for name, param in rpn.named_parameters():
  if param.requires_grad == True:
    params.append(param)
    print("\t", name)

lr = .0001
#params = list(rpn.parameters())
#print(list(rpn.parameters()))
optimizer = optim.Adam(rpn.parameters(), lr=lr)  # momentum=0.9)
#torch.autograd.set_detect_anomaly(True)
print(list(rpn.parameters())[0].grad)

for i in idSALAMI[:epochs]:
  a = list(rpn.parameters())[0].clone()
  i = 355  # overrides the loop index with a fixed sample index while debugging
  optimizer.zero_grad()
  sample = salamiScaled[i]
  rpn_loc, rpn_score, gt_rpn_loc, gt_rpn_score = rpn(image=sample['image'].to(device), bbox=sample['bboxes'].to(device), img_size=[800, 800])
  loss = rpn_loss(rpn_loc, rpn_score, gt_rpn_loc, gt_rpn_score)
  loss.backward()
  optimizer.step()
  print("epoch:", i, "loss:", loss)
  # check whether the first parameter actually changed after the optimizer step
  b = list(rpn.parameters())[0].clone()
  print(torch.equal(a.data, b.data))

My training scheme and module are in the thread. Apologies if they’re unorganized, I’m new to this!

You are detaching some activations in the forward method here:

  rpn_score = rpn_score.detach().requires_grad_(True)
  rpn_score = rpn_score.to(device)
  rpn_loc = rpn_loc.detach().requires_grad_(True)

which will (as the name suggests :wink: ) detach these tensors from the computation graph, so no gradients will be calculated for any of the previously used layers.
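If the intent of those lines is just to move the tensors to the GPU before the loss calculation, a minimal sketch of the end of forward without the detach calls (keeping your variable names) could look like this:

  # predictions: keep them attached to the graph so gradients can reach
  # the backbone, conv1, reg_layer, and cls_layer
  rpn_loc = pred_anchor_locs[0].to(device)
  rpn_score = pred_cls_scores[0].to(device)
  # targets come from numpy and don't need gradients; cast the box targets
  # to float32 so they match the dtype of the predictions
  gt_rpn_loc = torch.from_numpy(anchor_locations).float().to(device)
  gt_rpn_score = torch.from_numpy(anchor_labels).to(device)
  return rpn_loc, rpn_score, gt_rpn_loc, gt_rpn_score

Calling .detach().requires_grad_(True) creates a new leaf tensor, so autograd stops there and never reaches the layers that produced the values, while .to(device) on its own is differentiable and keeps the graph intact.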

Yep, this was it. Thanks!
