Randomly occurring backprop error in loss function

I’m encountering a strange error during backprop when using the GE2E loss (contrastive variant). It occurs seemingly at random, and the probability appears to increase with the number of speakers passed and decrease with the number of utterances passed. I’m 99.99% sure the loss is the culprit, because:
a) the parameters passed to it affect the failure rate as described above
b) I’ve trained an identical setup for a long time with no such error popping up when using my custom replacement loss instead of GE2E

Any help on how to go about figuring this out would be greatly appreciated - this is the most cryptic thing I’ve ever encountered…

I’m using:
-> PyTorch 1.6.0
-> Python 3.6.9

Custom loss:

def helper_loss(self, data):
    # data: (num_spk, num_utt, num_fea) batch of utterance embeddings
    num_spk, num_utt, num_fea = data.size()
    # Target matrix: identity over speakers, nearest-neighbour upsampled so
    # every same-speaker pair of rows is labelled 1 and every cross-speaker
    # pair is labelled 0 (i.e. a block-diagonal matrix of ones)
    labels = torch.unsqueeze(torch.unsqueeze(torch.eye(num_spk).to(self.device), 0), 0)
    upsample = nn.Upsample(scale_factor=num_utt, mode='nearest')
    labels = torch.squeeze(upsample(labels))
    # Pairwise dot products between all (num_spk * num_utt) embeddings
    features = data.reshape((num_spk * num_utt, num_fea))
    results = torch.mm(features, features.T)
    # self.mse is an nn.MSELoss instance defined elsewhere on the trainer
    return self.mse(labels, results)
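
For reference, the target this loss regresses the similarity matrix against is block-diagonal: 1 for same-speaker pairs of rows, 0 otherwise. A tiny standalone illustration (sizes are made up):

import torch
import torch.nn as nn

# Made-up sizes: 2 speakers, 3 utterances each
num_spk, num_utt = 2, 3
labels = torch.eye(num_spk).view(1, 1, num_spk, num_spk)
labels = nn.Upsample(scale_factor=num_utt, mode='nearest')(labels).squeeze()
print(labels)
# tensor([[1., 1., 1., 0., 0., 0.],
#         [1., 1., 1., 0., 0., 0.],
#         [1., 1., 1., 0., 0., 0.],
#         [0., 0., 0., 1., 1., 1.],
#         [0., 0., 0., 1., 1., 1.],
#         [0., 0., 0., 1., 1., 1.]])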

GE2E code:

import torch
import torch.nn as nn
import torch.nn.functional as F

class GE2ELoss(nn.Module):

    def __init__(self, init_w=10.0, init_b=-5.0, loss_method='softmax'):
        '''
        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]

        Accepts an input of size (N, M, D)

            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector (e.g. d-vector)

        Args:
            - init_w (float): defines the initial value of w in Equation (5) of [1]
            - init_b (float): defines the initial value of b in Equation (5) of [1]
        '''
        super(GE2ELoss, self).__init__()
        self.w = nn.Parameter(torch.tensor(init_w))
        self.b = nn.Parameter(torch.tensor(init_b))
        self.loss_method = loss_method

        assert self.loss_method in ['softmax', 'contrast']

        if self.loss_method == 'softmax':
            self.embed_loss = self.embed_loss_softmax
        if self.loss_method == 'contrast':
            self.embed_loss = self.embed_loss_contrast

    def calc_new_centroids(self, dvecs, centroids, spkr, utt):
        '''
        Calculates the new centroids excluding the reference utterance
        '''
        excl = torch.cat((dvecs[spkr,:utt], dvecs[spkr,utt+1:]))
        excl = torch.mean(excl, 0)
        new_centroids = []
        for i, centroid in enumerate(centroids):
            if i == spkr:
                new_centroids.append(excl)
            else:
                new_centroids.append(centroid)
        return torch.stack(new_centroids)

    def calc_cosine_sim(self, dvecs, centroids):
        '''
        Make the cosine similarity matrix with dims (N,M,N)
        '''
        cos_sim_matrix = []
        for spkr_idx, speaker in enumerate(dvecs):
            cs_row = []
            for utt_idx, utterance in enumerate(speaker):
                new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
                # vector-based cosine similarity for speed
                cs_row.append(torch.clamp(
                    torch.mm(utterance.unsqueeze(1).transpose(0, 1),
                             new_centroids.transpose(0, 1))
                    / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
                    1e-6))
            cs_row = torch.cat(cs_row, dim=0)
            cos_sim_matrix.append(cs_row)
        return torch.stack(cos_sim_matrix)

    def embed_loss_softmax(self, dvecs, cos_sim_matrix):
        '''
        Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
        '''
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                L_row.append(-F.log_softmax(cos_sim_matrix[j,i], 0)[j])
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    def embed_loss_contrast(self, dvecs, cos_sim_matrix):
        ''' 
        Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
        '''
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j,i])
                excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j+1:]))
                L_row.append(1. - torch.sigmoid(cos_sim_matrix[j,i,j]) + torch.max(excl_centroids_sigmoids))
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    def forward(self, dvecs):
        '''
        Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        '''
        #Calculate centroids
        centroids = torch.mean(dvecs, 1)

        #Calculate the cosine similarity matrix
        cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids)
        # Keep w positive; torch.clamp is not in-place, so the original bare
        # call `torch.clamp(self.w, 1e-6)` was a no-op - clamp in place instead
        with torch.no_grad():
            self.w.clamp_(min=1e-6)
        cos_sim_matrix = cos_sim_matrix * self.w + self.b
        L = self.embed_loss(dvecs, cos_sim_matrix)
        return L.mean()
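
To reproduce outside the full training loop, the loss can be exercised in isolation with random embeddings (sizes below are made up):

import torch

# Made-up sizes: 4 speakers, 5 utterances each, 256-dim d-vectors
N, M, D = 4, 5, 256
criterion = GE2ELoss(loss_method='contrast')
dvecs = torch.randn(N, M, D, requires_grad=True)
loss = criterion(dvecs)
loss.backward()
print(loss.item(), dvecs.grad.shape)  # scalar loss, grad of shape (4, 5, 256)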

Stack trace:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-4-d2aadca10c7e> in <module>
     33 
     34 for step in range((120000 * num_epochs) + 1):
---> 35     trainer.train_step(step)
     36 
     37 # output = np.zeros((0, 512)).astype(np.float32) # 512

/jupyter_lab/ge2e_efnet_test/trainers/features_distance.py in train_step(self, step)
    103             # print('GE2E loss: {}'.format(self.custom_loss(labels_reshaped).item()))
    104 
--> 105             self.each_student_end(i, loss)
    106 
    107         self.base_iteration_end(step)

/jupyter_lab/ge2e_efnet_test/trainers/base.py in each_student_end(self, index, loss)
     35     def each_student_end(self, index, loss):
     36         self.losses.append(loss.item())
---> 37         loss.backward(retain_graph=True)
     38         torch.nn.utils.clip_grad_norm_(self.students[index].parameters(), 1.0)
     39         self.optimizers[index].step()

/usr/local/lib/python3.6/dist-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    183                 products. Defaults to ``False``.
    184         """
--> 185         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    186 
    187     def register_hook(self, hook):

/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
    125     Variable._execution_engine.run_backward(
    126         tensors, grad_tensors, retain_graph, create_graph,
--> 127         allow_unreachable=True)  # allow_unreachable flag
    128 
    129 

RuntimeError: select(): index 0 out of range for tensor of size [0, 1] at dimension 0
Exception raised from select at /pytorch/aten/src/ATen/native/TensorShape.cpp:889 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7f3c829aa1e2 in /usr/local/lib/python3.6/dist-packages/torch/lib/libc10.so)
frame #1: at::native::select(at::Tensor const&, long, long) + 0x347 (0x7f3cbe8b6b97 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #2: <unknown function> + 0x1288329 (0x7f3cbec9b329 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #3: <unknown function> + 0x127b623 (0x7f3cbec8e623 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #4: at::select(at::Tensor const&, long, long) + 0xe0 (0x7f3cbebc0c90 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #5: <unknown function> + 0x2e06d26 (0x7f3cc0819d26 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0x127b623 (0x7f3cbec8e623 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #7: at::Tensor::select(long, long) const + 0xe0 (0x7f3cbed4bde0 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0x2d1223d (0x7f3cc072523d in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #9: torch::autograd::generated::MaxBackward1::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x188 (0x7f3cc073ec78 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0x3375bb7 (0x7f3cc0d88bb7 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #11: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x1400 (0x7f3cc0d84400 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #12: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x451 (0x7f3cc0d84fa1 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #13: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x7f3cc0d7d119 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #14: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x7f3cce51d4ba in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_python.so)
frame #15: <unknown function> + 0xbd66f (0x7f3d0e2bc66f in /usr/lib/x86_64-linux-gnu/libstdc++.so.6)
frame #16: <unknown function> + 0x76db (0x7f3d13cdc6db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #17: clone + 0x3f (0x7f3d1401588f in /lib/x86_64-linux-gnu/libc.so.6)

I think I sorted it: the model I was using had a dropout layer, which would occasionally drop all the utterances of a single speaker, causing the error. That matches the trace, too - the MaxBackward1 frame corresponds to the torch.max(excl_centroids_sigmoids) call in embed_loss_contrast, which is where the backward pass blows up on the degenerate batch.

Sorry I didn’t provide the model, which ended up being the cause (well, in combination with GE2E). Hope this helps someone!
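
If anyone runs into the same thing: a cheap sanity check in front of the loss turns the cryptic backward-pass failure into an immediate, readable error. A sketch (the zero-norm threshold and the check itself are assumptions about what dropout does to the batch):

import torch

def check_dvecs(dvecs, eps=1e-8):
    # Expect a non-empty (num_speakers, num_utts, dvec_dim) batch
    assert dvecs.dim() == 3 and min(dvecs.shape) > 0, \
        f'expected non-empty (N, M, D), got {tuple(dvecs.shape)}'
    # Flag speakers whose every utterance embedding is (near-)zero,
    # e.g. because dropout wiped out the whole row
    dead = (dvecs.norm(dim=2) < eps).all(dim=1)
    if dead.any():
        raise ValueError(
            f'all-zero embeddings for speakers {dead.nonzero().flatten().tolist()}')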