# Can't converge with triplet loss

I am trying to train a network, using triplet margin loss, to perform speaker identification task.
My dataset consists of MFCC features (1x128x248 images) extracted from audio files.
The problem I’m facing is that the training loss is getting stuck at the margin value (1.0).
I’m using online triplet mining, from this repository https://github.com/adambielski/siamese-triplet.
I have read that it’s easier to train a triplet loss model if you use pre-trained weights from a classification model, so I also trained the network with softmax loss.
After some frustrating time, I started to suspect that the problem might be on my dataset, so now I’m testing the model on some simple artificial data.
Now I realize that I am not even able to make the network converge on this simple artificial data. I have already tested different learning rates and triplet mining strategies (random-hard and hardest-negative).
So, I’m lost and need some help. I will post my code below; if someone can give me some tips, I would be grateful.

This is the model for classification:

``````self.features = nn.Sequential(
# Feature extractor: six Conv(3x3, pad 1) -> ReLU -> MaxPool(2) stages.
# Each pool halves H and W; shape comments assume a (1, 128, 248) MFCC input.
nn.Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (16, 128, 248)
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), # (16, 64, 124)
nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (32, 64, 124)
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), # (32, 32, 62)
nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (64, 32, 62)
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), # (64, 16, 31)
nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (128, 16, 31)
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), # (128, 8, 15)
nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (256, 8, 15)
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), # (256, 4, 7)
nn.Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (512, 4, 7)
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) # (512, 2, 3)
)
# Classifier head: flattened 3072 features -> 1211 hidden -> 10 class logits.
# NOTE(review): 1211 hidden units looks like a leftover class count
# (VoxCeleb1 has 1211 speakers) reused as a hidden size in front of a
# 10-way output — confirm this hidden width is intentional.
self.fcl = nn.Sequential(
nn.Linear(in_features=(512*2*3), out_features=1211, bias=True), # (512*2*3)=3072
nn.ReLU(inplace=True),
nn.Dropout(p=0.5, inplace=False),
nn.Linear(in_features=1211, out_features=10, bias=True)
)
``````

This is the model for embedding generation:

``````self.features = nn.Sequential(
# Same feature extractor as the classification net (so classification
# weights can be transferred): six Conv(3x3) -> ReLU -> MaxPool(2) stages,
# (1, 128, 248) input down to a (512, 2, 3) feature map.
nn.Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (16, 128, 248)
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), # (16, 64, 124)
nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (32, 64, 124)
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), # (32, 32, 62)
nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (64, 32, 62)
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), # (64, 16, 31)
nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (128, 16, 31)
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), # (128, 8, 15)
nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (256, 8, 15)
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False), # (256, 4, 7)
nn.Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), # (512, 4, 7)
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) # (512, 2, 3)
)
# Embedding head: 3072 -> 1024 -> 512 -> 256 -> 128-d embedding.
# NOTE(review): two triplet-loss pitfalls in this head worth checking:
# (1) Dropout right before the final Linear injects noise into the
#     embedding during training; (2) the output is never L2-normalized
#     (no F.normalize), so embedding norms are unconstrained — both are
#     common causes of the loss collapsing to the margin value.
self.fcl = nn.Sequential(
nn.Linear(in_features=(512*2*3), out_features=1024, bias=True), # (512*2*3)=3072
nn.ReLU(inplace=True),
nn.Dropout(p=0.5, inplace=False),
nn.Linear(in_features=1024, out_features=512, bias=True), # (512)
nn.ReLU(inplace=True),
nn.Dropout(p=0.5, inplace=False),
nn.Linear(in_features=512, out_features=256, bias=True), # (256)
nn.ReLU(inplace=True),
nn.Dropout(p=0.5, inplace=False),
nn.Linear(in_features=256, out_features=128, bias=True) # (128)
)
``````

This is how I have generated my artificial data. Ten classes, each with ten random samples:

``````for id in range(10):
id_path = os.path.join(root_path, ("id"+(str(id).zfill(6))))
if not os.path.isdir(id_path):
os.mkdir(id_path)
for x in range(10):
x_path = os.path.join(id_path, str(x).zfill(6)+".npy")
data = (np.random.rand(128, 248)*(id+1))
data = data.astype(np.float32)
# data = np.ones((128, 248), dtype=np.float32)
data.tofile(x_path)
# print(data)
print("x_path = ", x_path)
``````

Here is my training loop for the classification:

``````print("device = ", "cuda" if torch.cuda.is_available() else "cpu")
in_size = (128, 248)
trainDataset = SimpleDataSet(root_data_dir=root_data_path, samples_size=in_size)
num_workers=0, collate_fn=trainDataset.collate_fn)
margin = 1.0
net_classification = ClassificationNet().train().to(device)
criterion = nn.CrossEntropyLoss(reduction='mean')
lr1 = 0.0001
starting_epoch = 0
for epoch in range(starting_epoch,1000):  # loop over the dataset multiple times
print("Epoch %d started!"%(epoch+1))
running_loss = 0.0
epoch_loss = 0.0
running_acc = 0.0
epoch_acc = 0.0
count = 0.0
epoch_count = 0.0
for i, (samples, ids) in enumerate(trainLoader):
samples = samples.to(device)
ids = ids.to(device)

out = net_classification(samples)
loss = criterion(out ,ids)
loss.backward()
optimizer.step()
acc = net_classification.calcAcc(preds=out, targets=ids)

running_loss += loss.item()
epoch_loss += loss.item()*samples.size(0)
running_acc += acc
epoch_acc += acc*samples.size(0)
count += 1
epoch_count += samples.size(0)
if i % 2 == 1:
print('[%d, %5d] loss: %.5f' %
(epoch + 1, i + 1, running_loss / count))
print("[%d, %5d] acc: %.5f" %
(epoch + 1, i+1, running_acc / count))
running_loss = 0.0
running_acc = 0.0
count = 0.0
print(("Epoch %d finished!") % (epoch+1))
print("Epoch loss: ", epoch_loss / epoch_count)
print("Epoch acc: ", epoch_acc / epoch_count)
if epoch % 10 == 0:
torch.save({
'epoch': epoch,
'model_state_dict': net_classification.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'loss': epoch_loss / epoch_count,
}, out_weights_dir+"voice_net5_"+str(epoch+1)+".pth")

print('Finished Training')
``````

And this is my training loop for embedding generation:

``````print("device = ", "cuda" if torch.cuda.is_available() else "cpu")
trainDataset = BalancedDataset(root_data_path=root_data_path, k=4)
num_workers=0, collate_fn=trainDataset.collate_fn)
classification_net = ClassificationNet()

margin = 1.0
lr1 = 0.0001
net = Net().train()
net = net.to(device)
del checkpoint
del classification_net
criterion = OnlineTripletLoss(margin, HardestNegativeTripletSelector(margin))
starting_epoch = 0
for epoch in range(starting_epoch,1000):  # loop over the dataset multiple times
print("Epoch %d started!"%(epoch+1))
running_loss = 0.0
epoch_loss = 0.0
count = 0.0
epoch_count = 0.0
for i, (x, ids) in enumerate(trainLoader):
x = x.to(device)
ids = ids.to(device)
# forward + backward + optimize
features = net(x)

loss, n_triplets = criterion(features, ids)
assert n_triplets > 0
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()*n_triplets
epoch_loss += loss.item()*n_triplets
count += n_triplets
epoch_count += n_triplets
if i % 2 == 1:
print('[%d, %5d] loss: %.5f' %
(epoch + 1, i + 1, running_loss / count))
running_loss = 0.0
count = 0.0
print(("Epoch %d finished!") % (epoch+1))
print("Epoch loss: ", epoch_loss / epoch_count)
if epoch % 10 == 0:
torch.save({
'epoch': epoch,
'model_state_dict': net.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'loss': epoch_loss / epoch_count,
}, out_weights_dir+"net4_"+str(epoch+1)+".pth")
print('Finished Training')
``````