NaN with normalized embeddings

I am trying to train a siamese network for embedding generation, for the purpose of speaker identification.
For some reason, at a certain point during training, the network starts to output NaN.
My intuition tells me that it is related to the normalization I’m applying to the embeddings.

I will post the code of my network model and loss (online triplet loss). Any advice is helpful, thank you very much.

class Net(nn.Module):
    """
    Convolutional embedding network.

    Four Conv2d -> PReLU -> MaxPool2d stages (the pooling halves only the last axis),
    followed by adaptive average pooling to 4x4 and a three-layer fully connected head.
    The 128-dimensional output is L2-normalized along dim 1.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.in_size = (1, 13, 512)

        # Channel progression of the convolutional stages: 1 -> 64 -> 128 -> 256 -> 512.
        # For an (N, 1, 13, 512) input the last axis shrinks 512 -> 256 -> 128 -> 64 -> 32.
        stage_channels = (1, 64, 128, 256, 512)
        stages = []
        for c_in, c_out in zip(stage_channels[:-1], stage_channels[1:]):
            stages.append(nn.Conv2d(c_in, c_out, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))
            stages.append(nn.PReLU())
            stages.append(
                nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 2), padding=0, dilation=1, ceil_mode=False)
            )
        self.features = nn.Sequential(*stages)                        # (N, 512, 13, 32)

        self.avgpool = nn.AdaptiveAvgPool2d(output_size=(4, 4))       # (N, 512, 4, 4)

        head = [
            nn.Linear(in_features=512 * 4 * 4, out_features=512, bias=True),
            nn.PReLU(),
            nn.Linear(in_features=512, out_features=256, bias=True),
            nn.PReLU(),
            nn.Linear(in_features=256, out_features=128, bias=True),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, audio):
        """
        Forward propagation.
        :param audio: audio, a tensor of dimensions (N, 1, 13, 512)
        :return: L2-normalized embeddings, a tensor of dimensions (N, 128)
        """
        feature_map = self.features(audio)               # (N, 512, 13, 32)
        pooled = self.avgpool(feature_map)               # (N, 512, 4, 4)
        flat = pooled.view(-1, 512 * 4 * 4)              # (N, 8192)
        embedding = self.fc(flat)                        # (N, 128)
        # Project every embedding onto the unit hypersphere.
        return F.normalize(embedding, p=2, dim=1)
class OnlineTripletLoss(nn.Module):
    """
    Online triplet loss.

    Takes a batch of embeddings and the corresponding labels. Triplets are generated by the
    ``triplet_selector`` object, whose ``get_triplets(embeddings, target)`` method returns an
    index tensor with one ``(anchor, positive, negative)`` row per triplet.

    :param margin: margin added to ``d(anchor, positive) - d(anchor, negative)`` before the ReLU
    :param triplet_selector: object providing ``get_triplets(embeddings, target)``
    :param eps: small constant added to the squared distance before the square root, so that
                the gradient stays finite when two embeddings coincide
    """

    def __init__(self, margin, triplet_selector, eps=1e-8):
        super(OnlineTripletLoss, self).__init__()
        self.margin = margin
        self.triplet_selector = triplet_selector
        self.eps = eps

    def _distance(self, left, right):
        """Row-wise Euclidean distance between ``left`` and ``right`` with a safe gradient."""
        squared = (left - right).pow(2).sum(1)
        # d/dx sqrt(x) is infinite at x == 0; in the backward pass that infinity is multiplied
        # by the zero gradient of pow(2) and yields NaN. Offsetting by eps keeps it finite.
        return (squared + self.eps).sqrt()

    def forward(self, embeddings, target):
        """
        Compute the mean triplet loss over the selected triplets.

        :param embeddings: tensor of embeddings, indexed along dim 0 by the triplet indices
        :param target: labels passed through to the triplet selector
        :return: tuple ``(mean loss, number of triplets)``
        """
        triplets = self.triplet_selector.get_triplets(embeddings, target)
        # Index tensors must live on the same device as the tensor they index.
        triplets = triplets.to(embeddings.device)

        num_triplets = len(triplets)
        if num_triplets == 0:
            # mean() of an empty tensor is NaN; return a zero that is still attached to the
            # graph so that calling backward() on it remains valid.
            return embeddings.sum() * 0.0, 0

        anchors = embeddings[triplets[:, 0]]
        ap_distances = self._distance(anchors, embeddings[triplets[:, 1]])
        an_distances = self._distance(anchors, embeddings[triplets[:, 2]])
        losses = F.relu(ap_distances - an_distances + self.margin)

        return losses.mean(), num_triplets
1 Like

You could try running your code with torch.autograd.detect_anomaly to check whether a backward pass fails.
If you cannot isolate the issue and think that the forward pass could be creating invalid outputs, you could use forward hooks to check the outputs of all layers and narrow down the issue.