I’m encountering a strange error during backprop when using GE2E loss (contrastive). It occurs seemingly at random: the probability of hitting it seems to increase with the number of speakers passed and decrease with the number of utterances per speaker… I’m 99.99% sure the loss is the culprit, because:
a) the parameters passed to it affect it as stated above
b) I’ve trained an identical setup for a long time with no such error popping up when using my custom replacement loss instead of GE2E
Any help on how to go about figuring this out would be greatly appreciated - this is the most cryptic thing I’ve ever encountered…
I’m using:
-> pytorch 1.6.0
-> python 3.6.9
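Before the code: the one generic debugging step I know of for backward-time errors is autograd anomaly detection, which replays the forward trace so the error is reported at the forward op that produced the failing gradient (generic sketch, independent of my actual model):

```python
import torch

# Replays the forward trace so a backward-time RuntimeError is reported
# at the forward op that created the failing gradient, rather than deep
# inside the autograd engine. Available in PyTorch 1.6.
torch.autograd.set_detect_anomaly(True)
```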
Custom loss:
def helper_loss(self, data):
    num_spk, num_utt, num_fea = data.size()
    # target: block-diagonal matrix of ones, one (num_utt x num_utt) block per
    # speaker, built by nearest-neighbour upsampling of an identity matrix
    labels = torch.unsqueeze(torch.unsqueeze(torch.eye(num_spk).to(self.device), 0), 0)
    upsample = nn.Upsample(scale_factor=num_utt, mode='nearest')
    labels = torch.squeeze(upsample(labels))
    # prediction: Gram matrix of all (num_spk * num_utt) embeddings
    features = data.reshape((num_spk * num_utt, num_fea))
    results = torch.mm(features, features.T)
    return self.mse(results, labels)
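For context, here’s what that replacement loss computes, as a standalone sketch with dummy shapes (`mse` and `device` stand in for the class attributes; the (6, 4, 256) input is arbitrary):

```python
import torch
import torch.nn as nn

device = torch.device('cpu')
mse = nn.MSELoss()

data = torch.randn(6, 4, 256)  # (num_spk, num_utt, num_fea)
num_spk, num_utt, num_fea = data.size()

# target: block-diagonal matrix of ones, one (num_utt x num_utt) block
# per speaker, from nearest-neighbour upsampling of an identity matrix
labels = torch.eye(num_spk, device=device)[None, None]  # (1, 1, 6, 6)
labels = nn.Upsample(scale_factor=num_utt, mode='nearest')(labels).squeeze()  # (24, 24)

# prediction: Gram matrix of all num_spk * num_utt embeddings
features = data.reshape(num_spk * num_utt, num_fea)  # (24, 256)
results = features @ features.T                      # (24, 24)

print(labels.shape, results.shape, mse(results, labels).item())
```

In other words, it pushes same-speaker dot products toward 1 and cross-speaker dot products toward 0, which is why I can swap it in for GE2E without touching anything else.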
GE2E code:
import torch
import torch.nn as nn
import torch.nn.functional as F


class GE2ELoss(nn.Module):

    def __init__(self, init_w=10.0, init_b=-5.0, loss_method='softmax'):
        '''
        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]

        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector (e.g. d-vector)

        Args:
            - init_w (float): defines the initial value of w in Equation (5) of [1]
            - init_b (float): defines the initial value of b in Equation (5) of [1]
        '''
        super(GE2ELoss, self).__init__()
        self.w = nn.Parameter(torch.tensor(init_w))
        self.b = nn.Parameter(torch.tensor(init_b))
        self.loss_method = loss_method

        assert self.loss_method in ['softmax', 'contrast']

        if self.loss_method == 'softmax':
            self.embed_loss = self.embed_loss_softmax
        if self.loss_method == 'contrast':
            self.embed_loss = self.embed_loss_contrast

    def calc_new_centroids(self, dvecs, centroids, spkr, utt):
        '''
        Calculates the new centroids excluding the reference utterance
        '''
        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt+1:]))
        excl = torch.mean(excl, 0)
        new_centroids = []
        for i, centroid in enumerate(centroids):
            if i == spkr:
                new_centroids.append(excl)
            else:
                new_centroids.append(centroid)
        return torch.stack(new_centroids)

    def calc_cosine_sim(self, dvecs, centroids):
        '''
        Make the cosine similarity matrix with dims (N,M,N)
        '''
        cos_sim_matrix = []
        for spkr_idx, speaker in enumerate(dvecs):
            cs_row = []
            for utt_idx, utterance in enumerate(speaker):
                new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
                # vector based cosine similarity for speed
                # (the positional 1e-6 is the min argument of clamp)
                cs_row.append(torch.clamp(torch.mm(utterance.unsqueeze(1).transpose(0, 1), new_centroids.transpose(0, 1)) / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)), 1e-6))
            cs_row = torch.cat(cs_row, dim=0)
            cos_sim_matrix.append(cs_row)
        return torch.stack(cos_sim_matrix)

    def embed_loss_softmax(self, dvecs, cos_sim_matrix):
        '''
        Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
        '''
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    def embed_loss_contrast(self, dvecs, cos_sim_matrix):
        '''
        Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
        '''
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
                excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j+1:]))
                L_row.append(1. - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    def forward(self, dvecs):
        '''
        Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        '''
        # Calculate centroids
        centroids = torch.mean(dvecs, 1)
        # Calculate the cosine similarity matrix
        cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids)
        # torch.clamp is out-of-place, so the result must be assigned back
        # for the clamp on w to take effect
        self.w.data = torch.clamp(self.w.data, min=1e-6)
        cos_sim_matrix = cos_sim_matrix * self.w + self.b
        L = self.embed_loss(dvecs, cos_sim_matrix)
        return L.mean()
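To take my pipeline out of the equation, a hypothetical stress harness along these lines could hammer the loss in isolation with random d-vectors in the worst-case shape regime (many speakers, few utterances per speaker); whether random inputs can trigger the crash at all is an open question:

```python
# Hypothetical stress test for GE2ELoss in isolation: random d-vectors,
# many speakers / few utterances (the regime where the error is most
# frequent), repeated many times because the failure is sporadic.
torch.autograd.set_detect_anomaly(True)

criterion = GE2ELoss(loss_method='contrast')
for step in range(1000):
    dvecs = torch.randn(64, 2, 256, requires_grad=True)  # (N, M, D)
    loss = criterion(dvecs)
    loss.backward()
```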
Stack trace:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-4-d2aadca10c7e> in <module>
33
34 for step in range((120000 * num_epochs) + 1):
---> 35 trainer.train_step(step)
36
37 # output = np.zeros((0, 512)).astype(np.float32) # 512
/jupyter_lab/ge2e_efnet_test/trainers/features_distance.py in train_step(self, step)
103 # print('GE2E loss: {}'.format(self.custom_loss(labels_reshaped).item()))
104
--> 105 self.each_student_end(i, loss)
106
107 self.base_iteration_end(step)
/jupyter_lab/ge2e_efnet_test/trainers/base.py in each_student_end(self, index, loss)
35 def each_student_end(self, index, loss):
36 self.losses.append(loss.item())
---> 37 loss.backward(retain_graph=True)
38 torch.nn.utils.clip_grad_norm_(self.students[index].parameters(), 1.0)
39 self.optimizers[index].step()
/usr/local/lib/python3.6/dist-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
183 products. Defaults to ``False``.
184 """
--> 185 torch.autograd.backward(self, gradient, retain_graph, create_graph)
186
187 def register_hook(self, hook):
/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
125 Variable._execution_engine.run_backward(
126 tensors, grad_tensors, retain_graph, create_graph,
--> 127 allow_unreachable=True) # allow_unreachable flag
128
129
RuntimeError: select(): index 0 out of range for tensor of size [0, 1] at dimension 0
Exception raised from select at /pytorch/aten/src/ATen/native/TensorShape.cpp:889 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7f3c829aa1e2 in /usr/local/lib/python3.6/dist-packages/torch/lib/libc10.so)
frame #1: at::native::select(at::Tensor const&, long, long) + 0x347 (0x7f3cbe8b6b97 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #2: <unknown function> + 0x1288329 (0x7f3cbec9b329 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #3: <unknown function> + 0x127b623 (0x7f3cbec8e623 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #4: at::select(at::Tensor const&, long, long) + 0xe0 (0x7f3cbebc0c90 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #5: <unknown function> + 0x2e06d26 (0x7f3cc0819d26 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0x127b623 (0x7f3cbec8e623 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #7: at::Tensor::select(long, long) const + 0xe0 (0x7f3cbed4bde0 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0x2d1223d (0x7f3cc072523d in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #9: torch::autograd::generated::MaxBackward1::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x188 (0x7f3cc073ec78 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0x3375bb7 (0x7f3cc0d88bb7 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #11: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x1400 (0x7f3cc0d84400 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #12: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x451 (0x7f3cc0d84fa1 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #13: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x7f3cc0d7d119 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #14: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x7f3cce51d4ba in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_python.so)
frame #15: <unknown function> + 0xbd66f (0x7f3d0e2bc66f in /usr/lib/x86_64-linux-gnu/libstdc++.so.6)
frame #16: <unknown function> + 0x76db (0x7f3d13cdc6db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #17: clone + 0x3f (0x7f3d1401588f in /lib/x86_64-linux-gnu/libc.so.6)
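For what it’s worth, the message itself appears to be exactly what select() raises when asked for index 0 along an empty dimension:

```python
import torch

# Reproduces the error text from the trace: selecting index 0 along
# a dimension of size 0.
torch.empty(0, 1).select(0, 0)
# -> select(): index 0 out of range for tensor of size [0, 1] at dimension 0
```

Combined with frame #9 naming MaxBackward1, and the only torch.max in the loss being the one over excl_centroids_sigmoids in embed_loss_contrast, my best guess is that a tensor saved for that max’s backward somehow ends up empty by the time backward runs (my trainer calls loss.backward(retain_graph=True), visible in the trace), though I don’t see how that can happen from the shapes alone.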