SSD's loss not decreasing

I am implementing SSD (Single Shot Detector) in PyTorch as a study exercise.
However, my training loss does not decrease.
I've searched and tried various solutions for a week, but the problem remains.
What should I do?
Is my loss function incorrect?

Here is my SSD300 model:

SSD300(
  (feature_layers): ModuleDict(
    (conv1_1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu1_1): ReLU()
    (conv1_2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu1_2): ReLU()
    (pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (conv2_1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu2_1): ReLU()
    (conv2_2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu2_2): ReLU()
    (pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (conv3_1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu3_1): ReLU()
    (conv3_2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu3_2): ReLU()
    (conv3_3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu3_3): ReLU()
    (pool3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=True)
    (conv4_1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu4_1): ReLU()
    (conv4_2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu4_2): ReLU()
    (conv4_3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu4_3): ReLU()
    (pool4): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (conv5_1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu5_1): ReLU()
    (conv5_2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu5_2): ReLU()
    (conv5_3): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu5_3): ReLU()
    (pool5): MaxPool2d(kernel_size=(3, 3), stride=(1, 1), padding=1, dilation=1, ceil_mode=False)
    (conv6): Conv2d(512, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(6, 6), dilation=(6, 6))
    (relu6): ReLU()
    (conv7): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1))
    (relu7): ReLU()
    (conv8_1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (relu8_1): ReLU()
    (conv8_2): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (relu8_2): ReLU()
    (conv9_1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
    (relu9_1): ReLU()
    (conv9_2): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (relu9_2): ReLU()
    (conv10_1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
    (relu10_1): ReLU()
    (conv10_2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
    (relu10_2): ReLU()
    (conv11_1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
    (relu11_1): ReLU()
    (conv11_2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
    (relu11_2): ReLU()
  )
  (localization_layers): ModuleDict(
    (loc1): Sequential(
      (l2norm_loc1): L2Normalization()
      (conv_loc1): Conv2d(512, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_loc1): ReLU()
    )
    (loc2): Sequential(
      (conv_loc2): Conv2d(1024, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_loc2): ReLU()
    )
    (loc3): Sequential(
      (conv_loc3): Conv2d(512, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_loc3): ReLU()
    )
    (loc4): Sequential(
      (conv_loc4): Conv2d(256, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_loc4): ReLU()
    )
    (loc5): Sequential(
      (conv_loc5): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_loc5): ReLU()
    )
    (loc6): Sequential(
      (conv_loc6): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_loc6): ReLU()
    )
  )
  (confidence_layers): ModuleDict(
    (conf1): Sequential(
      (l2norm_conf1): L2Normalization()
      (conv_conf1): Conv2d(512, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_conf1): ReLU()
    )
    (conf2): Sequential(
      (conv_conf2): Conv2d(1024, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_conf2): ReLU()
    )
    (conf3): Sequential(
      (conv_conf3): Conv2d(512, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_conf3): ReLU()
    )
    (conf4): Sequential(
      (conv_conf4): Conv2d(256, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_conf4): ReLU()
    )
    (conf5): Sequential(
      (conv_conf5): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_conf5): ReLU()
    )
    (conf6): Sequential(
      (conv_conf6): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu_conf6): ReLU()
    )
  )
  (predictor): Predictor()
)
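
For reference, the prediction-head channel counts printed above are consistent with the standard SSD300 layout, assuming 21 classes (PASCAL VOC, i.e. 20 classes plus background) and 4/6/6/6/4/4 default boxes per location on the six source feature maps; a quick sanity check:

num_classes = 21  # assumption: PASCAL VOC (20 classes + background)
dboxes_per_loc = [4, 6, 6, 6, 4, 4]  # per feature map: conv4_3, conv7, conv8_2, conv9_2, conv10_2, conv11_2
for i, k in enumerate(dboxes_per_loc, start=1):
    print(f'loc{i}: {k * 4} channels, conf{i}: {k * num_classes} channels')
# -> loc: 16/24/24/24/16/16 and conf: 84/126/126/126/84/84, matching the model above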

My loss function (the multibox objective from the SSD paper, L = (L_conf + α·L_loc) / N, where N is the number of matched default boxes) is defined as:

import torch
import torch.nn as nn


class SSDLoss(nn.Module):
    def __init__(self, alpha=1, matching_func=None, loc_loss=None, conf_loss=None):
        super().__init__()

        self.alpha = alpha
        # matching_strategy is a module-level helper (not shown); it and the loss
        # modules below can be swapped out via the constructor arguments
        self.matching_strategy = matching_strategy if matching_func is None else matching_func
        self.loc_loss = LocalizationLoss() if loc_loss is None else loc_loss
        self.conf_loss = ConfidenceLoss() if conf_loss is None else conf_loss

    def forward(self, predicts, gts, dboxes):
        """
        :param predicts: Tensor, shape is (batch, total_dbox_nums, 4 + class_nums); the last dim is (cx, cy, w, h, p_class, ...)
        :param gts: Tensor, shape is (total bbox nums over the batch, 1 + 4 + class_nums); each row is (img_ind, cx, cy, w, h, p_class, ...)
        :param dboxes: Tensor, shape is (total_dbox_nums, 4); each row is (cx, cy, w, h)
        :return:
            loss: Tensor, scalar
        """
        # split predictions into localization and confidence parts
        pred_loc, pred_conf = predicts[:, :, :4], predicts[:, :, 4:]

        # match ground truth boxes to default boxes
        pos_indicator, gt_loc, gt_conf = self.matching_strategy(gts, dboxes, batch_num=predicts.shape[0], threshold=0.5)

        # encode ground truth boxes as offsets relative to the default boxes
        # (gt_loc_converter is an external helper, not shown)
        gt_loc = gt_loc_converter(gt_loc, dboxes)

        # localization loss
        loc_loss = self.loc_loss(pos_indicator, pred_loc, gt_loc)

        # confidence loss
        conf_loss = self.conf_loss(pos_indicator, pred_conf, gt_conf)

        return conf_loss + self.alpha * loc_loss
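
Since gt_loc_converter is not shown, here is a minimal sketch of the standard SSD offset encoding that such a helper usually implements (the function name and the conventional variances of 0.1/0.2 are assumptions, not code from my project):

def encode_offsets(gt_loc, dboxes, variances=(0.1, 0.2)):
    # gt_loc: (batch, dboxes num, 4), dboxes: (dboxes num, 4), both as (cx, cy, w, h)
    g_cxcy = (gt_loc[..., :2] - dboxes[..., :2]) / (dboxes[..., 2:] * variances[0])
    g_wh = torch.log(gt_loc[..., 2:] / dboxes[..., 2:]) / variances[1]
    return torch.cat([g_cxcy, g_wh], dim=-1)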


class LocalizationLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.smoothL1Loss = nn.SmoothL1Loss(reduction='none')

    def forward(self, pos_indicator, predicts, gts):
        # N = number of matched (positive) default boxes
        N = pos_indicator.sum()

        total_loss = self.smoothL1Loss(predicts, gts).sum(dim=-1)  # shape = (batch num, dboxes num)
        # only positive boxes contribute to the localization loss
        loss = total_loss.masked_select(pos_indicator)

        return loss.sum() / N

class ConfidenceLoss(nn.Module):
    def __init__(self, neg_factor=3):
        """
        :param neg_factor: int, the ratio (1 (pos) : neg_factor (neg)) used for hard negative mining
        """
        super().__init__()
        self.logsoftmax = nn.LogSoftmax(dim=-1)
        self._neg_factor = neg_factor

    def forward(self, pos_indicator, predicts, gts):
        # cross entropy against one-hot gts; shape = (batch num, dboxes num)
        loss = (-gts * self.logsoftmax(predicts)).sum(dim=-1)

        N = pos_indicator.sum()
        neg_indicator = torch.logical_not(pos_indicator)

        pos_loss = loss.masked_select(pos_indicator)
        neg_loss = loss.masked_select(neg_indicator)

        # hard negative mining: keep at most neg_factor * N hardest negatives
        neg_num = neg_loss.shape[0]
        neg_num = min(neg_num, int(self._neg_factor * N))  # cast to int: topk expects an int k

        _, topk_indices = torch.topk(neg_loss, neg_num)
        neg_loss = neg_loss.index_select(dim=0, index=topk_indices)

        return (pos_loss.sum() + neg_loss.sum()) / N
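
With these definitions, both loss heads can be smoke-tested on random tensors (a sketch; 8732 default boxes and 21 classes are assumed, as in standard SSD300):

if __name__ == '__main__':
    batch, n_dboxes, n_classes = 2, 8732, 21
    pos = torch.rand(batch, n_dboxes) < 0.01                 # ~1% positive boxes
    pred_loc = torch.randn(batch, n_dboxes, 4)
    gt_loc = torch.randn(batch, n_dboxes, 4)
    pred_conf = torch.randn(batch, n_dboxes, n_classes)
    gt_conf = torch.eye(n_classes)[torch.randint(n_classes, (batch, n_dboxes))]  # one-hot labels
    print(LocalizationLoss()(pos, pred_loc, gt_loc))
    print(ConfidenceLoss()(pos, pred_conf, gt_conf))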

The loss output is below; it drops over the first ~80 iterations and then plateaus around 12.27:

Training... Epoch: 1, Iter: 1,	 [32/21503	 (0%)]	Loss: 28.804445
Training... Epoch: 1, Iter: 10,	 [320/21503	 (1%)]	Loss: 12.880742
Training... Epoch: 1, Iter: 20,	 [640/21503	 (3%)]	Loss: 15.932519
Training... Epoch: 1, Iter: 30,	 [960/21503	 (4%)]	Loss: 14.624641
Training... Epoch: 1, Iter: 40,	 [1280/21503	 (6%)]	Loss: 16.301014
Training... Epoch: 1, Iter: 50,	 [1600/21503	 (7%)]	Loss: 15.710087
Training... Epoch: 1, Iter: 60,	 [1920/21503	 (9%)]	Loss: 12.441727
Training... Epoch: 1, Iter: 70,	 [2240/21503	 (10%)]	Loss: 12.283393
Training... Epoch: 1, Iter: 80,	 [2560/21503	 (12%)]	Loss: 12.272835
Training... Epoch: 1, Iter: 90,	 [2880/21503	 (13%)]	Loss: 12.273635
Training... Epoch: 1, Iter: 100,	 [3200/21503	 (15%)]	Loss: 12.273409
Training... Epoch: 1, Iter: 110,	 [3520/21503	 (16%)]	Loss: 12.266172
Training... Epoch: 1, Iter: 120,	 [3840/21503	 (18%)]	Loss: 12.272820
Training... Epoch: 1, Iter: 130,	 [4160/21503	 (19%)]	Loss: 12.274920
Training... Epoch: 1, Iter: 140,	 [4480/21503	 (21%)]	Loss: 12.275247
Training... Epoch: 1, Iter: 150,	 [4800/21503	 (22%)]	Loss: 12.273258
Training... Epoch: 1, Iter: 160,	 [5120/21503	 (24%)]	Loss: 12.277486
Training... Epoch: 1, Iter: 170,	 [5440/21503	 (25%)]	Loss: 12.266512
Training... Epoch: 1, Iter: 180,	 [5760/21503	 (27%)]	Loss: 12.265674
Training... Epoch: 1, Iter: 190,	 [6080/21503	 (28%)]	Loss: 12.265306
Training... Epoch: 1, Iter: 200,	 [6400/21503	 (30%)]	Loss: 12.269717
Training... Epoch: 1, Iter: 210,	 [6720/21503	 (31%)]	Loss: 12.274122
Training... Epoch: 1, Iter: 220,	 [7040/21503	 (33%)]	Loss: 12.263970
Training... Epoch: 1, Iter: 230,	 [7360/21503	 (34%)]	Loss: 12.267252