Can't converge with triplet loss

I am trying to train a network with the triplet margin loss to perform a speaker identification task.
My dataset consists of MFCC features (1x128x248 images) extracted from audio files.
The problem I'm facing is that the training loss gets stuck at the margin value (1.0) (see the sketch below).
I'm using online triplet mining from this repository: https://github.com/adambielski/siamese-triplet.
I have read that a triplet loss model is easier to train if you start from the weights of a pre-trained classification model, so I also trained the network with a softmax loss first.
After some frustrating time, I started to suspect that the problem might be in my dataset, so I am now testing the model on some simple artificial data.
Now I realize that I am not even able to make the network converge on this simple artificial data. I have already tested different learning rates and triplet mining strategies (random hard negative and hardest negative).
So I'm lost and need some help. I will post my code below; if someone can give me some tips, I would be grateful.
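
As I understand it, the loss sitting exactly at the margin usually means the embeddings have (almost) collapsed to a single point, so the anchor-positive and anchor-negative distances are equal. A minimal sketch in plain PyTorch (not the mining code from the repository) showing that behaviour:

import torch
import torch.nn as nn

# If the network maps every input to (nearly) the same embedding,
# d(anchor, positive) == d(anchor, negative) and the loss equals the margin.
criterion = nn.TripletMarginLoss(margin=1.0)
collapsed = torch.ones(8, 128)                 # a "batch" of identical embeddings
loss = criterion(collapsed, collapsed, collapsed)
print(loss.item())                             # -> 1.0, i.e. stuck at the margin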

This is the model for classification:

self.features = nn.Sequential(
    nn.Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),    # (16, 128, 248)
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),  # (16, 64, 124)
    nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),   # (32, 64, 124)
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),  # (32, 32, 62)
    nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),   # (64, 32, 62)
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),  # (64, 16, 31)
    nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),  # (128, 16, 31)
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),  # (128, 8, 15)
    nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (256, 8, 15)
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),  # (256, 4, 7)
    nn.Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (512, 4, 7)
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)   # (512, 2, 3)
)
self.fcl = nn.Sequential(
    nn.Linear(in_features=(512*2*3), out_features=1211, bias=True),  # (512*2*3)=3072
    nn.ReLU(inplace=True),
    nn.Dropout(p=0.5, inplace=False),
    nn.Linear(in_features=1211, out_features=10, bias=True)
)
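
To double-check the shape comments and the in_features=(512*2*3) of the first linear layer, here is a quick standalone shape check of the same conv stack with a dummy MFCC-sized input (just a sanity sketch, not part of the model file):

import torch
import torch.nn as nn

# Dummy forward through an identical conv stack to verify the output shape.
features = nn.Sequential(
    nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
)
out = features(torch.randn(1, 1, 128, 248))    # one (1, 128, 248) MFCC sample
print(out.shape)                               # torch.Size([1, 512, 2, 3]) -> 3072 when flattened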

This is the model for embedding generation:

self.features = nn.Sequential(
    nn.Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),    # (16, 128, 248)
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),  # (16, 64, 124)
    nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),   # (32, 64, 124)
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),  # (32, 32, 62)
    nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),   # (64, 32, 62)
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),  # (64, 16, 31)
    nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),  # (128, 16, 31)
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),  # (128, 8, 15)
    nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (256, 8, 15)
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),  # (256, 4, 7)
    nn.Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (512, 4, 7)
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)   # (512, 2, 3)
)
self.fcl = nn.Sequential(
    nn.Linear(in_features=(512*2*3), out_features=1024, bias=True),  # (512*2*3)=3072
    nn.ReLU(inplace=True),
    nn.Dropout(p=0.5, inplace=False),
    nn.Linear(in_features=1024, out_features=512, bias=True),  # (512)
    nn.ReLU(inplace=True),
    nn.Dropout(p=0.5, inplace=False),
    nn.Linear(in_features=512, out_features=256, bias=True),   # (256)
    nn.ReLU(inplace=True),
    nn.Dropout(p=0.5, inplace=False),
    nn.Linear(in_features=256, out_features=128, bias=True)    # (128)
)
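
Because the loss keeps sitting at the margin, one thing worth watching during training is the spread of the embeddings within a batch; if it goes to zero, the network has collapsed. A small illustrative helper (the name embedding_spread is made up, it is not part of the model above):

import torch

def embedding_spread(embeddings: torch.Tensor) -> float:
    """Mean pairwise distance inside a batch of embeddings of shape (N, D).
    A value near zero means the embeddings have collapsed to one point."""
    dists = torch.cdist(embeddings, embeddings)              # (N, N) pairwise distances
    mask = ~torch.eye(embeddings.size(0), dtype=torch.bool)  # drop the zero diagonal
    return dists[mask].mean().item()

# e.g. inside the triplet training loop: print(embedding_spread(features.detach().cpu()))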

This is how I generated my artificial data (ten classes, each with ten random samples):

import os
import numpy as np

for id in range(10):
    id_path = os.path.join(root_path, "id" + str(id).zfill(6))
    if not os.path.isdir(id_path):
        os.mkdir(id_path)
    for x in range(10):
        x_path = os.path.join(id_path, str(x).zfill(6) + ".npy")
        # Uniform noise in [0, id+1): the class id only sets the scale of the values.
        data = np.random.rand(128, 248) * (id + 1)
        data = data.astype(np.float32)
        # data = np.ones((128, 248), dtype=np.float32)
        # tofile() writes raw float32 bytes, without the .npy header.
        data.tofile(x_path)
        # print(data)
        print("x_path = ", x_path)

Here is my training loop for classification:

print("device = ", "cuda" if torch.cuda.is_available() else "cpu")
in_size = (128, 248)
trainDataset = SimpleDataSet(root_data_dir=root_data_path, samples_size=in_size)
trainLoader = torch.utils.data.DataLoader(trainDataset, batch_size=50, shuffle=True,
                                            num_workers=0, collate_fn=trainDataset.collate_fn)
margin = 1.0
net_classification = ClassificationNet().train().to(device)
criterion = nn.CrossEntropyLoss(reduction='mean')
lr1 = 0.0001
optimizer = optim.Adam(net_classification.parameters(), lr=lr1, weight_decay=0.0005)
starting_epoch = 0
for epoch in range(starting_epoch,1000):  # loop over the dataset multiple times
    print("Epoch %d started!"%(epoch+1))
    running_loss = 0.0
    epoch_loss = 0.0
    running_acc = 0.0
    epoch_acc = 0.0
    count = 0.0
    epoch_count = 0.0
    for i, (samples, ids) in enumerate(trainLoader):
        samples = samples.to(device)
        ids = ids.to(device)

        optimizer.zero_grad()
        
        out = net_classification(samples)
        loss = criterion(out, ids)
        loss.backward()
        optimizer.step()
        acc = net_classification.calcAcc(preds=out, targets=ids)
        
        running_loss += loss.item()
        epoch_loss += loss.item()*samples.size(0)
        running_acc += acc
        epoch_acc += acc*samples.size(0)
        count += 1
        epoch_count += samples.size(0)
        if i % 2 == 1:
            print('[%d, %5d] loss: %.5f' %
                (epoch + 1, i + 1, running_loss / count))
            print("[%d, %5d] acc: %.5f" % 
                (epoch + 1, i+1, running_acc / count))
            running_loss = 0.0
            running_acc = 0.0
            count = 0.0
    print(("Epoch %d finished!") % (epoch+1))
    print("Epoch loss: ", epoch_loss / epoch_count)
    print("Epoch acc: ", epoch_acc / epoch_count)
    if epoch % 10 == 0:
        torch.save({
            'epoch': epoch,
            'model_state_dict': net_classification.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': epoch_loss / epoch_count,
            }, out_weights_dir+"voice_net5_"+str(epoch+1)+".pth")

print('Finished Training')
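
The calcAcc helper used in the loop above is not shown in the post; for reference, an equivalent arg-max accuracy computation would be (illustrative sketch, not necessarily the exact implementation):

import torch

def calc_acc(preds: torch.Tensor, targets: torch.Tensor) -> float:
    # Fraction of samples whose arg-max class matches the target label
    # (what calcAcc is expected to return; the real helper is omitted here).
    return (preds.argmax(dim=1) == targets).float().mean().item()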

And this is my training loop for embedding generation:

print("device = ", "cuda" if torch.cuda.is_available() else "cpu")
trainDataset = BalancedDataset(root_data_path=root_data_path, k=4)
trainLoader = torch.utils.data.DataLoader(trainDataset, batch_size=5, shuffle=True,
                                            num_workers=0, collate_fn=trainDataset.collate_fn)
classification_net = ClassificationNet()
checkpoint = torch.load("./weights/class_net_101.pth")
classification_net.load_state_dict(checkpoint['model_state_dict'])

margin = 1.0
lr1 = 0.0001  
net = Net().train()
net.features.load_state_dict(classification_net.features.state_dict())
net = net.to(device)
del checkpoint
del classification_net
optimizer = optim.Adam(net.parameters(), lr=lr1, weight_decay=0.0005)
criterion = OnlineTripletLoss(margin, HardestNegativeTripletSelector(margin))
starting_epoch = 0
for epoch in range(starting_epoch,1000):  # loop over the dataset multiple times
    print("Epoch %d started!"%(epoch+1))
    running_loss = 0.0
    epoch_loss = 0.0
    count = 0.0
    epoch_count = 0.0
    for i, (x, ids) in enumerate(trainLoader):
        x = x.to(device)
        ids = ids.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        features = net(x)
        
        loss, n_triplets = criterion(features, ids)
        assert n_triplets > 0
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()*n_triplets
        epoch_loss += loss.item()*n_triplets
        count += n_triplets
        epoch_count += n_triplets
        if i % 2 == 1:
            print('[%d, %5d] loss: %.5f' %
                (epoch + 1, i + 1, running_loss / count))
            running_loss = 0.0
            count = 0.0
    print(("Epoch %d finished!") % (epoch+1))
    print("Epoch loss: ", epoch_loss / epoch_count)
    if epoch % 10 == 0:
        torch.save({
            'epoch': epoch,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': epoch_loss / epoch_count,
            }, out_weights_dir+"net4_"+str(epoch+1)+".pth")
print('Finished Training')
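
After training, a quick way to see whether the 128-D embeddings separate the ten artificial classes at all is a nearest-neighbour check on the pairwise distances. A sketch of such a check (it assumes a loader that yields (samples, ids) batches like the ones above):

import torch

@torch.no_grad()
def nearest_neighbor_accuracy(net, loader, device):
    # Embed every sample, then check whether each sample's nearest
    # neighbour (excluding itself) carries the same speaker label.
    net.eval()
    embs, labels = [], []
    for x, ids in loader:
        embs.append(net(x.to(device)).cpu())
        labels.append(ids.cpu())
    embs, labels = torch.cat(embs), torch.cat(labels)
    dists = torch.cdist(embs, embs)
    dists.fill_diagonal_(float("inf"))          # ignore self-matches
    nn_idx = dists.argmin(dim=1)
    return (labels[nn_idx] == labels).float().mean().item()

# e.g.: print(nearest_neighbor_accuracy(net, trainLoader, device))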