Loss is always starting with certain value and not changing

I am working on creating a simple multi-class classifier using the CAER dataset, which classifies images into 7 basic emotion categories. Instead of sending the entire image, I pass only the facial region, which is detected with a face detector, then cropped and sent to the network. Initially I created my own model with 5 convolutional layers, trained with an SGD optimizer and a learning rate of 1e-3. While training, the loss decreases with each epoch, but no matter what, it always starts around 1.9. I also tried a pretrained model where I froze some of the layers and trained it, but it still always starts at a value of about 1.9. I also applied augmentation and standardised the values, but there was still no change. I don't know where I am going wrong. Please suggest any further changes I need to make, or point out where I am going wrong.

Below is the code for custom dataset and model.
For pretrained model, I tried with alexnet,vgg16,resnet18.

class CAERDataset(Dataset):
    """Dataset of face crops from the CAER emotion dataset.

    Each sample is the first detected face in an image (cropped, then
    optionally transformed); the label is parsed from the image's
    parent directory name.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # NOTE(review): constructing the detector per dataset instance means
        # every DataLoader worker loads the detection model — expensive, but
        # this matches the original design.
        self.detector = DSFDDetector()
        self.image_list = glob.glob(self.root_dir + '/*/*png')

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, idx):
        # A DataLoader may hand us a tensor index. `Tensor.to_list` does not
        # exist — the correct method is `tolist()` (bug in the original).
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_path = self.image_list[idx]
        image = Image.open(img_path)
        # Label comes from the directory name. CrossEntropyLoss expects
        # integer class indices in [0, num_classes); if the folders are
        # named 1..7 this is off by one — TODO confirm and subtract 1.
        img_label = float(img_path.split('/')[-2])
        im = np.array(image)

        detections = self.detector.detect_face(im, confidence_threshold=.5, shrink=1.0)

        if len(detections) == 0:
            # No face found: fall back to the whole image.
            face_image = image
        else:
            # Use only the first detection (the original looped and broke
            # immediately, which is equivalent).
            # detect_face appears to return (x1, y1, x2, y2, ...) boxes,
            # which matches PIL's crop((left, upper, right, lower)) —
            # TODO confirm against the DSFD API; (x, y, w, h) boxes would
            # produce wrong crops here.
            x, y, w, h = detections[0][0], detections[0][1], detections[0][2], detections[0][3]
            face_image = image.crop((x, y, w, h))
        label = np.array(img_label)

        if self.transform:
            face_image = self.transform(face_image)

        return (face_image, label)
                             
class Model(nn.Module):
    """Small CNN classifier: 5 conv stages -> adaptive pool -> MLP head.

    Input:  (N, 3, H, W) image batch.
    Output: (N, 7) unnormalised class logits, one per emotion category.
    """

    def __init__(self):
        super(Model, self).__init__()

        def conv_stage(cin, cout):
            # Conv -> ReLU -> BatchNorm triple, same ordering throughout.
            return [
                nn.Conv2d(cin, cout, kernel_size=3, stride=1),
                nn.ReLU(),
                nn.BatchNorm2d(cout),
            ]

        layers = []
        layers += conv_stage(3, 32)
        layers += conv_stage(32, 64)
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False))
        layers += conv_stage(64, 128)
        layers.append(nn.MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False))
        layers += conv_stage(128, 256)
        layers.append(nn.MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False))
        layers += conv_stage(256, 256)
        layers.append(nn.MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False))
        self.features = nn.Sequential(*layers)

        # Pool the feature map to a fixed 5x5 grid so the classifier input
        # size (256 * 5 * 5 = 6400) is independent of input resolution.
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=(5, 5))
        self.classifier = nn.Sequential(
            nn.BatchNorm1d(6400),
            nn.Dropout(0.5),
            nn.Linear(6400, 2048),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.Linear(512, 7),
        )

    def forward(self, x):
        feats = self.avgpool(self.features(x))
        flat = feats.view(feats.size(0), -1)
        return self.classifier(flat)


main.py

# Training transform: random-crop/flip augmentation plus ImageNet
# mean/std normalisation (matches the pretrained-model preprocessing).
transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# Evaluation transform: deterministic resize, no augmentation.
test_transform = transforms.Compose([
    transforms.Resize([224, 224]),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# NOTE: the lines below were indented one level in the original, which is
# a syntax error at module scope; they are de-indented here.
train_data = CAERDataset(root_dir='./data/train', transform=transform)

# 80/20 train/validation split over shuffled indices of the train set.
num_train = len(train_data)
indices = list(range(num_train))
split = int(np.floor(0.2 * num_train))
np.random.shuffle(indices)
train_idx, val_idx = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(val_idx)

# NOTE(review): both loaders share train_data, so the validation loader
# also applies the *augmenting* train transform — consider a separate
# non-augmented dataset for validation.
train_loader = torch.utils.data.DataLoader(train_data, sampler=train_sampler, batch_size=32)
val_loader = torch.utils.data.DataLoader(train_data, sampler=val_sampler, batch_size=32)

test_data = CAERDataset(root_dir='./data/test', transform=test_transform)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=32, shuffle=True, num_workers=1)

optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

Can you also post the training code here?

# Multi-class loss over raw logits and integer class indices in [0, 7).
# With 7 classes and random weights the expected initial loss is
# -ln(1/7) ≈ 1.946 — exactly the "always starts at 1.9x" value observed,
# so that starting point is normal, not a bug.
criterion = nn.CrossEntropyLoss()
# TensorBoard logging under runs/caer.
writer = SummaryWriter('runs/caer')

# Per-epoch history, appended to by train().
train_losses, val_losses = [], []
train_accs, val_accs = [], []

def train(args, dev, model, optimizer, dataloaders):
    """Train `model` for `args.epochs` epochs, validating each epoch.

    Uses the module-level `criterion` and `writer`, appends per-epoch
    stats to the module-level history lists, checkpoints the model, and
    finally runs evaluate() on the test set.

    Args:
        args: namespace with at least `epochs`.
        dev: torch device (or device string) to run on.
        model: the classifier to train (modified in place).
        optimizer: optimizer over `model.parameters()`.
        dataloaders: (train_loader, val_loader, test_loader) tuple.
    """
    train_loader, val_loader, test_loader = dataloaders
    for epoch in range(args.epochs):
        print(epoch)
        train_loss, val_loss = 0, 0
        train_acc, val_acc = 0, 0

        # --- training pass ---
        model.train()  # make train mode explicit (dropout/batchnorm active)
        for images, labels in train_loader:
            images = images.to(dev)
            # CrossEntropyLoss needs integer class indices; convert once
            # before the forward pass (the original converted after the
            # forward and moved labels to the device twice).
            labels = labels.type(torch.LongTensor).to(dev)

            out_pred = model(images)
            loss = criterion(out_pred, labels)
            # NOTE: a first-iteration loss of ~1.946 (= ln 7) is what a
            # random 7-class classifier should produce, so "always starts
            # at 1.9x" is expected behaviour.

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = torch.max(out_pred.data, 1)
            train_acc += (predicted == labels).sum().item() / labels.size(0)

        # --- validation pass ---
        # (The original hung this off a for/else clause, which always runs
        # when the loop doesn't break — plain sequential code is clearer.)
        model.eval()
        with torch.no_grad():
            for images, labels in val_loader:
                images = images.to(dev)
                labels = labels.type(torch.LongTensor).to(dev)

                val_out_pred = model(images)
                loss = criterion(val_out_pred, labels)
                val_loss += loss.item()

                _, predicted = torch.max(val_out_pred.data, 1)
                val_acc += (predicted == labels).sum().item() / labels.size(0)

        # Averages are per-batch (len(loader) == number of batches).
        n_train = len(train_loader)
        n_val = len(val_loader)
        print("Epoch: {}/{}.. ".format(epoch, args.epochs),
              "Training Loss: {:.3f}.. ".format(train_loss / n_train),
              "Validation Loss: {:.3f}.. ".format(val_loss / n_val),
              "Training Accuracy: {: .3f}.. ".format(train_acc / n_train * 100),
              "Validation Accuracy: {:.3f}".format(val_acc / n_val * 100))

        train_losses.append(train_loss / n_train)
        val_losses.append(val_loss / n_val)
        train_accs.append(train_acc / n_train * 100)
        val_accs.append(val_acc / n_val * 100)
        writer.add_scalar('Train/Loss', train_loss / n_train, epoch)
        writer.add_scalar('Val/Loss', val_loss / n_val, epoch)
        writer.flush()

    # Checkpoint the full model plus optimizer state for later resume.
    checkpoint = {'model': model,
                  'state_dict': model.state_dict(),
                  'optimizer': optimizer.state_dict()}
    torch.save(checkpoint, 'caer_model.pth')
    evaluate(model, test_loader, args, dev)

def evaluate(model, test_loader, args, dev):
    """Compute and print top-1 accuracy on `test_loader`.

    Fixes the original, which used `total += ...` and `correct += ...`
    without ever initialising `total`/`correct` (a guaranteed NameError).

    Args:
        model: callable mapping an image batch to (N, C) logits.
        test_loader: iterable of (images, labels) batches.
        args: unused; kept for interface compatibility.
        dev: torch device (or device string) to run on.

    Returns:
        The accuracy as a percentage (the original returned None).
    """
    total = 0
    correct = 0
    for images, labels in test_loader:
        images, labels = images.to(dev), labels.to(dev)

        with torch.no_grad():
            out_pred = model(images)
            _, predicted = torch.max(out_pred.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print('Test Accuracy of the model on test images: {} %'.format(accuracy))
    return accuracy