Multilabel classification, loss isn't decreasing and output are almost same

I have been trying to do Multilabel classification for 19 different classes. I used custom image data loader.
I used Densenet201 as backbone for inference. The loss function I used is BCEWithLogitsLoss. Below is my all relevant code.

Edit: I almost forgot, I had a dataframe which had ID as one column and labels in the other. I one hot encoded the data so that I could get a sparse matrix corresponding to every image.

Image Data Loader

class ImageDataLoader(Dataset):
    def __init__(self, dframe, transform = None):
        self.df = dframe
        self.transform = transform       
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img = Image.open(self.df['ID'][idx])
        img = np.asarray(img)
        img = img/255
        if self.transform:
            img = self.transform(img)
        label = torch.FloatTensor(df.drop(columns = ['ID']).loc[idx])
        sample = {'img': img, 'label': label}
        return sample

tfms = transforms.Compose([transforms.ToTensor(),
                           transforms.Normalize([0.07237246, 0.04476176, 0.07661699], [0.17179589, 0.10284516, 0.14199627]),
                           transforms.Resize((224, 224)),
                           transforms.RandomHorizontalFlip(),
                           transforms.RandomVerticalFlip(10)])

train_loader = ImageDataLoader(train_df, transform = tfms)#1000 images
train = DataLoader(train_loader, batch_size = 16, shuffle = False)

Inference Model

np.random.seed(1)
torch.manual_seed(1)

model = models.densenet201(pretrained = True)#Downloads model with pretrained config
for param in model.parameters():
    param.requires_grad = False

from collections import OrderedDict
classifier = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(1920, 1024)),
    ('relu', nn.ReLU()),
    ('fc2', nn.Linear(1024, 512)),
    ('relu', nn.ReLU()),
    ('fc3', nn.Linear(512, 19))
]))

model.classifier = classifier

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr = 3e-2)
#lamda = lambda epoch: 10**(epoch/2)
#scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lamda, last_epoch=-1, verbose = True)

model = model.cuda()

Training

epochs = 16
steps = 0

train_losses, val_losses, lrs = [], [], []

for e in tqdm(range(epochs)):
  #lrs.append(optimizer.param_groups[0]["lr"])
  for sample in train:
    images, labels = sample['img'].to(torch.float32).cuda(), sample['label'].cuda()
    optimizer.zero_grad()
    ps = model.forward(images)
    loss = criterion(ps, labels)
    loss.backward()
    optimizer.step()
    images, labels = images.cpu(), labels.cpu()
  else:
    with torch.no_grad():
      for sample in val:
        images, labels = sample['img'].to(torch.float32).cuda(), sample['label'].cuda()
        ps = model.forward(images)
        val_loss = criterion(ps, labels)
        top_p, top_class = ps.topk(1, dim = 1)
        #equals = top_class == labels.view(*top_class.shape)
        #accuracy = torch.mean(equals.type(torch.FloatTensor))
        images, labels = images.cpu(), labels.cpu()
    train_losses.append(loss)
    val_losses.append(val_loss)
    #accuracies.append(accuracy)
    #scheduler.step() 
    print('Epoch..{}/{}'.format(e + 1, epochs),
          'Training Loss..{:.3f}'.format(train_losses[e]),
          'Test Loss..{:.3f}'.format(val_losses[e]))
         #'Accuracy..{:.3f}'.format(accuracies[e]))
    
model = model.cpu()

##RESULTS


  6%|▋         | 1/16 [00:57<14:25, 57.69s/it]
Epoch..1/16 Training Loss..0.252 Test Loss..0.255
 12%|█▎        | 2/16 [01:40<11:27, 49.12s/it]
Epoch..2/16 Training Loss..0.238 Test Loss..0.252
 19%|█▉        | 3/16 [02:24<10:05, 46.56s/it]
Epoch..3/16 Training Loss..0.241 Test Loss..0.246
 25%|██▌       | 4/16 [03:08<09:06, 45.57s/it]
Epoch..4/16 Training Loss..0.241 Test Loss..0.246
 31%|███▏      | 5/16 [03:53<08:18, 45.33s/it]
Epoch..5/16 Training Loss..0.240 Test Loss..0.248
 38%|███▊      | 6/16 [04:38<07:32, 45.21s/it]
Epoch..6/16 Training Loss..0.240 Test Loss..0.250
 44%|████▍     | 7/16 [05:22<06:44, 44.95s/it]
Epoch..7/16 Training Loss..0.239 Test Loss..0.252
 50%|█████     | 8/16 [06:07<05:58, 44.83s/it]
Epoch..8/16 Training Loss..0.239 Test Loss..0.253
 56%|█████▋    | 9/16 [06:52<05:15, 45.05s/it]
Epoch..9/16 Training Loss..0.238 Test Loss..0.255
 62%|██████▎   | 10/16 [07:36<04:28, 44.69s/it]
Epoch..10/16 Training Loss..0.238 Test Loss..0.256
 69%|██████▉   | 11/16 [08:20<03:42, 44.52s/it]
Epoch..11/16 Training Loss..0.238 Test Loss..0.257
 75%|███████▌  | 12/16 [09:05<02:57, 44.43s/it]
Epoch..12/16 Training Loss..0.238 Test Loss..0.257
 81%|████████▏ | 13/16 [09:48<02:12, 44.18s/it]
Epoch..13/16 Training Loss..0.238 Test Loss..0.258
 88%|████████▊ | 14/16 [10:32<01:27, 43.98s/it]
Epoch..14/16 Training Loss..0.238 Test Loss..0.258
 94%|█████████▍| 15/16 [11:15<00:43, 43.75s/it]
Epoch..15/16 Training Loss..0.238 Test Loss..0.258
100%|██████████| 16/16 [12:00<00:00, 45.00s/it]
Epoch..16/16 Training Loss..0.238 Test Loss..0.258

Code for prediction

with torch.no_grad():
    model.cpu()
    img = Image.open(df['ID'][9])
    img = tfms(img)
    img = img.reshape(1, 3, 224, 224)
    print(F.sigmoid(model.forward(img)))

tensor([[0.4760, 0.0375, 0.0830, 0.0524, 0.0395, 0.0595, 0.0410, 0.0753, 0.0423,
0.0370, 0.0197, 0.0011, 0.1263, 0.0783, 0.0797, 0.0187, 0.2127, 0.0200,
0.0035]]) #Output for image 9

tensor([[0.4760, 0.0375, 0.0830, 0.0524, 0.0395, 0.0595, 0.0410, 0.0753, 0.0423,
0.0370, 0.0197, 0.0011, 0.1263, 0.0783, 0.0797, 0.0187, 0.2127, 0.0200,
0.0035]])#Output for image 4

Please help, thank you!

Have you tried different learning rates?

Yes, I used scheduler to find a good lr. But loss was (validation loss) always in between 0.2 and 0.3.