Loss decrease to slow

Hello everyone. I’m trying to train loaded pre-trained resnet18 using arcface loss. I have a dataset of about 1,300,000 photos of 10171 people. One epoch takes about an hour to train. Loss starts from about 40, by the end of the epoch it decreases to 20. Over the next 2 epochs, it decreases to 16-17, and this is where the decrease in loss ends, or slows down very much. I tried using the StepLR scheduler every 1000 batches multiplying the learning rate by 0.9 and setting the threshold to 3e-4. This is my first time trying to train a model using ArcFace loss and I don’t know all the nuances yet. Tell me, please, how can I fix the problem and what could be wrong? Following is my code:

This is my model:

class ArcFaceHead(nn.Module):
    def __init__(self, num_classes, device=device, embedding_size=16, margin=0.5, scale=30):
        super(ArcFaceHead, self).__init__()
        self.num_classes = num_classes
        self.embedding_size = embedding_size
        self.margin = margin
        self.scale = scale
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.threshold = math.cos(math.pi - margin)
        self.mm = self.sin_m * margin
        self.weight = nn.Parameter(torch.FloatTensor(num_classes, embedding_size))

    def forward(self, x, labels):
        cosine = F.linear(F.normalize(x), F.normalize(self.weight))
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        phi = torch.where(cosine > self.threshold, phi, cosine - self.mm)
        one_hot = torch.zeros(cosine.size(), device=x.device)
        one_hot.scatter_(1, labels.view(-1, 1).long(), 1)
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale
        return output
class ResNet18ArcFace(nn.Module):
    def __init__(self, num_classes, embedding_size=16):
        super(ResNet18ArcFace, self).__init__()
        self.embedding_size = embedding_size
        self.backbone = models.resnet18(pretrained=True)
        self.backbone.fc = nn.Sequential(
            nn.Linear(512, self.embedding_size),
        self.head = ArcFaceHead(num_classes, embedding_size=self.embedding_size)

    def forward(self, x, labels):
        x = self.backbone(x)
        x = self.head(x, labels)
        return x

My train function:

def train(model, criterion, device, train_loader, optimizer, epoch, best_loss, scheduler, saved=0):
    progress_bar = tqdm(total=len(train_loader.dataset), dynamic_ncols=True, leave=False) #, dynamic_ncols=True)
    for batch_idx, (data, labels) in enumerate(train_loader):
        data, labels = data.to(device), labels.to(device)
        output = model(data, labels)
        loss = criterion(output, labels)
        l_rate = get_lr(optimizer)

        progress_bar.set_description(f'Epoch: {epoch} Loss: {loss.item():.6f} LR: {l_rate:.7f} Saved: {saved}')
        if (batch_idx + 1) % LOG_INTERVAL == 0:
            saved += 1
            if l_rate > 3e-4:
            best_loss = loss
            epoch = epoch
            path = f"/content/drive/My Drive/model_checkpoint/note_arc_checkpoint_loss.pth"

                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': best_loss,
                }, path)

            best_loss = loss

And my “settings”:

NUM_OF_CLASSES = len(train_dataset.classes)
LR = 0.01 # initial learning rate
W_DECAY = 0.95

model = ResNet18ArcFace(NUM_OF_CLASSES)

criterion = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=W_DECAY)

scheduler = StepLR(optimizer, step_size=LR_STEP, gamma=0.9)

Edit: I just noticed that in the first epoch, the loss smoothly drops to 13, and then sharply increases to 16-17 and remains in this range throughout the rest of the epochs