I am trying to train a Siamese network to generate embeddings for speaker identification.
For some reason, at a certain point during training, the network starts to output NaN.
My intuition is that it is related to the normalization I’m applying to the embeddings.
I will post the code of my network model and loss (online triplet loss) below. Any advice is helpful; thank you very much.
class Net(nn.Module):
    """Convolutional embedding network producing unit-length 128-d vectors.

    The model stacks four ``Conv2d -> PReLU -> MaxPool2d`` stages (the pooling
    halves only the last axis), reduces the result to a 4x4 grid with adaptive
    average pooling, and maps it through a three-layer fully connected head.
    The output of the head is L2-normalized along the feature axis.
    """

    def __init__(self):
        super().__init__()
        self.in_size = (1, 13, 512)

        # Channel counts at the boundaries of the four convolutional stages.
        widths = (1, 64, 128, 256, 512)
        stages = []
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            stages.extend([
                # 3x3 convolution with padding 1 keeps the spatial size.
                nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1),
                nn.PReLU(),
                # (1, 2) pooling halves the last axis and leaves the other alone.
                nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 2)),
            ])
        self.features = nn.Sequential(*stages)

        self.avgpool = nn.AdaptiveAvgPool2d(output_size=(4, 4))  # (N, 512, 4, 4)

        self.fc = nn.Sequential(
            nn.Linear(512 * 4 * 4, 512),
            nn.PReLU(),
            nn.Linear(512, 256),
            nn.PReLU(),
            nn.Linear(256, 128),
        )

    def forward(self, audio):
        """
        Forward propagation.

        :param audio: audio, a tensor of dimensions (N, 1, 13, 512)
        :return: L2-normalized embeddings, a tensor of dimensions (N, 128)
        """
        feature_maps = self.features(audio)      # (N, 512, 13, 32) for the input above
        pooled = self.avgpool(feature_maps)      # (N, 512, 4, 4)
        flat = pooled.view(-1, 512 * 4 * 4)      # (N, 8192)
        embedding = self.fc(flat)                # (N, 128)
        # Scale every row to unit L2 norm so embeddings lie on the unit hypersphere.
        return F.normalize(embedding, p=2, dim=1)
class OnlineTripletLoss(nn.Module):
    """
    Online triplet loss.

    Takes a batch of embeddings and corresponding labels. Triplets are
    generated by the ``triplet_selector`` object, whose ``get_triplets``
    method receives the embeddings and targets and returns the indices of the
    triplets (columns: anchor, positive, negative).

    :param margin: margin added to ``d(anchor, positive) - d(anchor, negative)``
    :param triplet_selector: object exposing ``get_triplets(embeddings, target)``
    :param eps: lower bound applied to squared distances before the square
        root, which keeps the gradient finite when two embeddings coincide
    """

    def __init__(self, margin, triplet_selector, eps=1e-12):
        super().__init__()
        self.margin = margin
        self.triplet_selector = triplet_selector
        self.eps = eps

    def _euclidean(self, left, right):
        """Return row-wise Euclidean distances with a finite gradient at zero."""
        squared = (left - right).pow(2).sum(1)
        # The derivative of sqrt at 0 is infinite; in backward it turns into
        # 0 * inf = NaN even for triplets the ReLU has switched off. Clamping
        # first blocks the gradient for (near-)zero distances instead.
        return squared.clamp(min=self.eps).sqrt()

    def forward(self, embeddings, target):
        """
        Compute the mean triplet loss over the selected triplets.

        :param embeddings: tensor of embeddings, one row per sample
        :param target: labels passed through to the triplet selector
        :return: tuple ``(mean loss, number of triplets)``
        """
        triplets = self.triplet_selector.get_triplets(embeddings, target)

        if len(triplets) == 0:
            # The mean of an empty tensor is NaN; return a zero that is still
            # attached to the graph so that backward() remains valid.
            return embeddings.sum() * 0.0, 0

        # Follow the embeddings' device rather than the default CUDA device.
        triplets = triplets.to(embeddings.device)

        anchors = embeddings[triplets[:, 0]]
        positives = embeddings[triplets[:, 1]]
        negatives = embeddings[triplets[:, 2]]

        ap_distances = self._euclidean(anchors, positives)
        an_distances = self._euclidean(anchors, negatives)
        losses = F.relu(ap_distances - an_distances + self.margin)
        return losses.mean(), len(triplets)