No update in loss. Multi-input model, pre-trained densenets, sigmoid and BCELoss

Background:
I’m trying to train on image pairs, which are views from different angles of the same object. I want to perform a binary classification task. I’m using 2 pretrained DenseNet-121, which are later connected by some FC layers and finally a sigmoid activation. This is clubbed with a BCELoss. Big pieces of code follow, I think Model code and training loop code should be helpful.

What I’ve tried/observed till now:
a) varying learning rate. from 1e-2 to 1e-4. No changes in loss even then, trained for 3 epochs for 1e-2, 1e-3, 1e-4.
b) outputs are always 0.5 from the network. The layers aren’t initialised to 0/1/(any constant) right? So is this the expected behavior?

Please help!

Model code:

class classifier(nn.Module):
    def __init__(self):
        super(classifier,self).__init__()

        def get_densenet_model():
            model = torch.hub.load('pytorch/vision:v0.6.0', 'densenet121', pretrained=True)
            modules=list(model.children())[:-1]
            model=nn.Sequential(*modules)
            return model

        self.frontal_model=get_densenet_model()
        self.lateral_model=get_densenet_model()
        LOG(2,"DenseNet121 models loaded")
        self.frontal_fc1=nn.Linear(1024*8*8,1024*8)
        self.lateral_fc1=nn.Linear(1024*8*8,1024*8)
        self.frontal_fc2=nn.Linear(1024*8,512)
        self.lateral_fc2=nn.Linear(1024*8,512)
        self.final_fc1=nn.Linear(1024,512)
        self.final_fc2=nn.Linear(512,128)
        self.final_fc3=nn.Linear(128,1)
        self.sigmoid=nn.Sigmoid()

    def forward(self, frontal_img, lateral_img):
        x=self.frontal_model(frontal_img)
        #x=self.frontal_model.features(frontal_img)
        y=self.lateral_model(lateral_img)
        #y=self.lateral_model.features(lateral_img)
        x=x.view(x.size(0),-1)
        y=y.view(y.size(0),-1)
        x=nn.functional.relu(self.frontal_fc1(x))
        x=nn.functional.relu(self.frontal_fc2(x))
        y=nn.functional.relu(self.lateral_fc1(y))
        y=nn.functional.relu(self.lateral_fc2(y))
        x=torch.cat((x,y),1)
        x=nn.functional.relu(self.final_fc1(x))
        x=nn.functional.relu(self.final_fc2(x))
        x=nn.functional.relu(self.final_fc3(x))
        x=self.sigmoid(x)
        return x.squeeze()

Relevant to data loading:

class MyDataset(Dataset):
    def __init__(self, image_dir, label_csv_path, frontal_lateral_path, transform=None, resize_=None):
        image_path_list=[]
        frontal_lateral_df=pd.read_csv(frontal_lateral_path)
        LOG(2, "read frontal-lateral csv completed")
        def get_full_path(file_):
            return os.path.join(image_dir,file_+".png")

        for i in range(len(frontal_lateral_df)):
            image_path_list.append( (get_full_path(frontal_lateral_df['frontal'][i]),get_full_path(frontal_lateral_df['lateral'][i]) ))
        
        label_df=pd.read_csv(label_csv_path)
        LOG(2, "read label csv completed")
        labels=label_df['normal']
        labels=list(np.array(labels).astype(float))
        self.data_paths = image_path_list
        self.labels = labels
        self.transform = transform
        self.resize = resize_
        LOG(2, "Dataset initialisation completed")
    def __getitem__(self, index):
        frontal = Image.open(self.data_paths[index][0])
        frontal.convert('RGB')
        if self.transform:
            frontal = self.transform(frontal)
        if self.resize:
            frontal = self.resize(frontal)

        lateral = Image.open(self.data_paths[index][1])
        lateral.convert('RGB')
        if self.transform:
            lateral = self.transform(lateral)
        if self.resize:
            frontal = self.resize(lateral)
        
        normal=self.labels[index]

        return frontal, lateral, normal

    def __len__(self):
        return len(self.data_paths)

def get_data():
    dataset = MyDataset(image_dir , labels_path, frontal_lateral_path, init_train_transform())#equalisation_stuff, resize_stuff)
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(VAL_TRAIN_SPLIT * dataset_size))
    if SHUFFLE_DATASET :
        np.random.seed(RANDOM_SEED)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, 
                                           sampler=train_sampler)
    validation_loader = DataLoader(dataset, batch_size=BATCH_SIZE,
                                                sampler=valid_sampler)
    LOG(2, "Dataloaders initialisation completed")
    return train_loader, validation_loader

class gray_2_rgb(object):
    def __call__(self,img):
        img = np.asanyarray(img)
        return skimage.color.gray2rgb(img)

def init_train_transform():
        transform = transforms.Compose([
            gray_2_rgb(),
            transforms.ToPILImage(),
            transforms.Resize((256,256)),
            # transforms.RandomCrop(224),
            # transforms.RandomHorizontalFlip(),
            transforms.ToTensor() ])
            # transforms.Normalize((0.485, 0.456, 0.406),
            #                      (0.229, 0.224, 0.225))]
            #)
        return transform

Training Loop:

def train(train_loader, validation_loader):
    model= classifier()
    if torch.cuda.is_available():
        model.cuda()
    model.train()
    criterion = nn.BCELoss().cuda()
    optimizer = optim.Adam(model.parameters(), lr=0.001)#, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    for epoch in tqdm(range(NUM_EPOCHS)):
        running_loss=0.0
        #print('epoch: '+str(epoch))
        for data in train_loader:
            frontal_images, lateral_images, labels = data
            frontal_images, lateral_images, labels = frontal_images.to(device), lateral_images.to(device), labels.to(device)
            #frontal_images, lateral_images, labels = data
            optimizer.zero_grad()
            outputs = model(frontal_images, lateral_images)
            labels=labels.float().to(device)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print('Epoch: {} Train_Loss: {:.4f}'.format(epoch, running_loss))
        if(epoch % 5 == 0):
            correct=0.0
            total=0.0
            with torch.no_grad():
                val_loss=0
                for frontal_images, lateral_images, labels in validation_loader:
                    outputs = model(frontal_images, lateral_images)
                    labels=labels.float().to(device)
                    loss = criterion(outputs, labels)
                    val_loss += loss.item()
                    total+=len(labels)
                    correct+=num_correct(outputs,labels)

                print('Epoch: {} Val_Loss: {:.4f}'.format(epoch, val_loss))
                print('Epoch: {} Val_Acc: {:.4f}'.format(epoch, correct/total))