Weights become NaN values after first batch step

I am training a model with a Conv1d layer on top of TDNN layers. When I inspect conv_tdnn in the TDNNBase forward function, the values look fine after the first batch is executed. But when I check the kernels/weights that I created and registered as parameters (self.kernel and self.bias), they become NaN from the second batch onwards, i.e. right after the optimization step.
How do I fix this in PyTorch?

import torch
import torch.nn as nn
import torch.nn.functional as F


class TDNNBase(nn.Module):
    """TDNN layer with context = [-k, +k] (full context) or an explicit frame list."""

    def __init__(self, in_channels, out_channels, context, full_context=True):
        super(TDNNBase, self).__init__()
        self.count = 0
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.full_context = full_context
        self.is_context_valid(context)   # helper (omitted here): check that the context is valid

        self.kernel_size, context = self.get_cont_ker_size(context, self.full_context)
        self.register_buffer("context", torch.LongTensor(context))
        self.kernel = nn.Parameter(torch.randn(self.out_channels, self.in_channels, self.kernel_size))
        self.bias   = nn.Parameter(torch.randn(self.out_channels))
        nn.init.xavier_uniform_(self.kernel)
        # xavier_uniform_ needs >= 2 dims; the unsqueezed view shares storage with self.bias
        nn.init.xavier_uniform_(torch.unsqueeze(self.bias, 0))

    def forward(self, input):
        conv_tdnn = self.tdnn_layer(input, self.kernel, self.context, self.bias)
        return conv_tdnn

    def tdnn_layer(self, data_batch, kernel, context, bias):
        """
        Context-wise convolution: the Time Delay Neural Network layer built on
        top of 1D convolution. Since an ordinary 1D convolution slides over the
        whole sequence, we instead select only the frames inside the given
        context window and convolve over those frames alone.
        """
        # input shape: batch size x feature dimension x sequence (frame) length
        batch_size, feature_dim, input_seq_length = data_batch.size()
        num_frames_to_convolve = self.get_num_frames(context, input_seq_length)  # helper (omitted here)
        x_conv_tdnn = torch.zeros(batch_size, kernel.size(0), len(num_frames_to_convolve),
                                  device=data_batch.device, dtype=data_batch.dtype)
        for i, step in enumerate(num_frames_to_convolve):
            # select only the frames inside the context window for this step
            features_to_convolve = torch.index_select(data_batch, 2, context + step)
            x_conv_tdnn[:, :, i] = F.conv1d(features_to_convolve, kernel, bias=bias, padding=1)[:, :, 0]
        return x_conv_tdnn

class TDNN(nn.Module):
    def __init__(self):
        super(TDNN, self).__init__()
        self.tdnn1 = TDNNBase(13, 512, [-2, 2], full_context=True)    # in_channels = 13 MFCC feature dimensions
        self.bn1   = nn.BatchNorm1d(512)
        self.tdnn2 = TDNNBase(512, 64, [-1, 2], full_context=False)   # out_channels = number of feature maps/kernels
        self.bn2   = nn.BatchNorm1d(64)
        self.conv1 = nn.Conv1d(64, 16, 1, padding=0)
        self.fc1   = nn.Linear(16, 5)

    def forward(self, input):
        input = self.tdnn1(input)
        input = F.relu(input)
        input = self.bn1(input)
        input = self.tdnn2(input)
        input = F.relu(input)
        input = self.bn2(input)
        input = self.conv1(input)
        input = F.relu(input)
        input = input.transpose(1, 2)   # (batch, frames, 16) so fc1 acts on the channel dimension
        input = self.fc1(input)
        return input

#Training
def train_fxn(model, criterion, optim, train_loader, device):
    model.train()
    correct, total, running_train_loss = 0, 0, 0.0
    for i, (batch, label, _) in enumerate(train_loader):
        batch = batch.to(device)
        label = label.to(device)
        optim.zero_grad()                    # zero out gradients
        output = model(batch)                # forward pass
        loss = criterion(output, label)      # loss
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optim.step()
        _, predictions = torch.max(output.data, 1)

        total += label.size(0)
        correct += (predictions == label).sum().item()
        running_train_loss += loss.item() * batch.size(0)

    train_avg_loss = running_train_loss / len(train_loader)
    return train_avg_loss, correct, total
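
A common stop-gap while debugging exploding updates (a sketch using the names above, not a fix confirmed in this thread) is to clip the gradients and skip the update whenever any gradient is non-finite:

loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
# only step if every gradient that exists is finite
grads_finite = all(
    torch.isfinite(p.grad).all()
    for p in model.parameters() if p.grad is not None
)
if grads_finite:
    optim.step()
else:
    print("Skipping step: non-finite gradients detected")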


#Output
**After the first batch, at conv_tdnn (the return value of the TDNNBase forward function):**
tensor([[[ 0.0886,  0.0941,  0.0760,  ..., -0.1054, -0.1054, -0.1054],
         [-0.2494, -0.2688, -0.2743,  ..., -0.0443, -0.0443, -0.0443],
         [ 0.0373, -0.0574, -0.1424,  ..., -0.0450, -0.0450, -0.0450],
         ...,
         [ 0.1292,  0.0439, -0.0055,  ...,  0.0344,  0.0344,  0.0344],
         [ 0.1009,  0.1313,  0.0728,  ...,  0.0845,  0.0845,  0.0845],
         [-0.0864, -0.0101, -0.0266,  ...,  0.0706,  0.0706,  0.0706]],

        [[-0.0520, -0.0331, -0.0263,  ..., -0.1054, -0.1054, -0.1054],
         [-0.0542, -0.0904, -0.1329,  ..., -0.0443, -0.0443, -0.0443],
         [-0.2399, -0.2217, -0.1750,  ..., -0.0450, -0.0450, -0.0450],
         ...,
         [ 0.1098,  0.1261,  0.1243,  ...,  0.0344,  0.0344,  0.0344],
         [-0.0078,  0.0465,  0.0430,  ...,  0.0845,  0.0845,  0.0845],
         [-0.0522, -0.0356, -0.0696,  ...,  0.0706,  0.0706,  0.0706]],

        [[-0.0368, -0.0417, -0.0544,  ..., -0.1054, -0.1054, -0.1054],
         [-0.1158, -0.1055, -0.1214,  ..., -0.0443, -0.0443, -0.0443],
         [-0.2064, -0.1804, -0.1543,  ..., -0.0450, -0.0450, -0.0450],
         ...,
         [ 0.0432,  0.0606,  0.0588,  ...,  0.0344,  0.0344,  0.0344],
         [-0.0737, -0.0635, -0.0451,  ...,  0.0845,  0.0845,  0.0845],
         [-0.1350, -0.1560, -0.1374,  ...,  0.0706,  0.0706,  0.0706]],

        ...,

        [[-0.0406, -0.0759, -0.0934,  ...,  0.0270,  0.0437, -0.0089],
         [-0.0683, -0.0960, -0.1011,  ..., -0.4773, -0.4974, -0.4225],
         [-0.1967, -0.1860, -0.2016,  ...,  0.0212,  0.1038,  0.0512],
         ...,
         [ 0.0874,  0.0988,  0.0706,  ...,  0.1736,  0.1758,  0.1879],
         [-0.0339, -0.0344,  0.0466,  ..., -0.0173, -0.0020,  0.0231],
         [-0.0484, -0.0529,  0.0136,  ..., -0.3070, -0.3332, -0.3312]],

        [[ 0.0814,  0.0658,  0.0750,  ..., -0.0087,  0.0375,  0.0046],
         [-0.2866, -0.2804, -0.2798,  ..., -0.1776, -0.0918, -0.1062],
         [-0.3017, -0.2633, -0.2088,  ..., -0.1002, -0.1149, -0.1084],
         ...,
         [ 0.0134,  0.0475,  0.0628,  ...,  0.1704,  0.1373,  0.1426],
         [-0.0346, -0.0488, -0.0529,  ..., -0.0078, -0.1018, -0.1739],
         [-0.1197, -0.1306, -0.1403,  ..., -0.1600, -0.1617, -0.1827]],

        [[ 0.0168, -0.0307, -0.0600,  ...,  0.0956,  0.0794,  0.0750],
         [-0.1676, -0.1456, -0.1850,  ..., -0.2501, -0.2628, -0.2468],
         [-0.2551, -0.2276, -0.1515,  ..., -0.0386, -0.1205, -0.1908],
         ...,
         [ 0.0117,  0.0067,  0.0812,  ...,  0.1185,  0.1009,  0.1022],
         [-0.0080,  0.0480,  0.0536,  ..., -0.0798, -0.0375, -0.0067],
         [-0.0579, -0.0628, -0.1125,  ..., -0.1455, -0.1393, -0.1766]]],
       grad_fn=<CopySlices>)

**From the second batch, at conv_tdnn (the return value of the TDNNBase forward function), i.e. the output after the optimization step:**
tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        ...,

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]]], grad_fn=<CopySlices>) 

@ptrblck This time I tried not to tag you, but I didn't receive any response, so I had to tag you again :wink:

Note: The weights become NaN after the model performs the optim.step() operation.
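
One way to pin this down (a debugging sketch using the model/optim names above, not part of the original post) is to snapshot the parameters around the step and report the first non-finite entries:

# snapshot parameters, run the update, then look for non-finite entries
before = {name: p.detach().clone() for name, p in model.named_parameters()}
optim.step()
for name, p in model.named_parameters():
    if not torch.isfinite(p).all():
        n_bad = (~torch.isfinite(p)).sum().item()
        print(f"{name}: {n_bad} non-finite entries after step() "
              f"(max |param| before step was {before[name].abs().max().item():.4e})")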

**Weights before *optim.step()***

tensor([[[ 0.0216, -0.0274,  0.0092,  0.0200,  0.0098],
         [ 0.0302,  0.0371,  0.0188,  0.0294,  0.0384],
         [ 0.0126, -0.0456, -0.0403, -0.0410,  0.0268],
         ...,
         [-0.0453, -0.0446,  0.0200,  0.0058,  0.0293],
         [-0.0332,  0.0131,  0.0335, -0.0257, -0.0236],
         [-0.0450, -0.0447,  0.0255,  0.0492, -0.0292]],

        [[-0.0161,  0.0003,  0.0291, -0.0007,  0.0112],
         [ 0.0287, -0.0169,  0.0059,  0.0356, -0.0112],
         [-0.0397, -0.0554, -0.0096,  0.0362,  0.0300],
         ...,
         [-0.0120, -0.0267, -0.0218, -0.0219, -0.0436],
         [ 0.0274, -0.0510, -0.0040, -0.0502,  0.0355],
         [ 0.0230, -0.0012, -0.0350,  0.0491,  0.0433]],

        [[-0.0398, -0.0122, -0.0506, -0.0266, -0.0049],
         [-0.0192, -0.0178, -0.0416, -0.0558,  0.0473],
         [ 0.0518,  0.0558,  0.0386, -0.0165,  0.0240],
         ...,
         [ 0.0546, -0.0279,  0.0175, -0.0328, -0.0483],
         [ 0.0532, -0.0204,  0.0396,  0.0378, -0.0361],
         [ 0.0267, -0.0551, -0.0053, -0.0142, -0.0090]],

        ...,

        [[-0.0477, -0.0505,  0.0310,  0.0422,  0.0457],
         [-0.0293, -0.0361, -0.0409, -0.0054,  0.0356],
         [ 0.0281,  0.0491, -0.0228,  0.0474, -0.0226],
         ...,
         [ 0.0085,  0.0486, -0.0494,  0.0506,  0.0161],
         [ 0.0318,  0.0403, -0.0126, -0.0433,  0.0090],
         [ 0.0186, -0.0376,  0.0377, -0.0441,  0.0089]],

        [[-0.0529, -0.0501,  0.0453, -0.0431, -0.0019],
         [-0.0292,  0.0338,  0.0170,  0.0268, -0.0038],
         [-0.0112,  0.0170,  0.0096, -0.0053, -0.0020],
         ...,
         [ 0.0211,  0.0496, -0.0416,  0.0355, -0.0046],
         [ 0.0255,  0.0460,  0.0415, -0.0307,  0.0130],
         [ 0.0048,  0.0503,  0.0074, -0.0438, -0.0265]],

        [[ 0.0247,  0.0221, -0.0234,  0.0502, -0.0549],
         [-0.0090, -0.0014,  0.0221,  0.0153,  0.0137],
         [ 0.0230,  0.0089, -0.0253,  0.0080, -0.0016],
         ...,
         [-0.0449, -0.0292,  0.0301, -0.0041,  0.0114],
         [ 0.0387,  0.0142, -0.0176,  0.0145,  0.0414],
         [ 0.0173,  0.0252,  0.0454,  0.0539, -0.0155]]], requires_grad=True)

**Weights after *optim.step()***

tensor([[[ 0.0216,     nan,     nan,     nan,     nan],
         [ 0.0302,     nan,     nan,     nan,     nan],
         [ 0.0126,     nan,     nan,     nan,     nan],
         ...,
         [-0.0453,     nan,     nan,     nan,     nan],
         [-0.0332,     nan,     nan,     nan,     nan],
         [-0.0450,     nan,     nan,     nan,     nan]],

        [[-0.0161,     nan,     nan,     nan,     nan],
         [ 0.0287,     nan,     nan,     nan,     nan],
         [-0.0397,     nan,     nan,     nan,     nan],
         ...,
         [-0.0120,     nan,     nan,     nan,     nan],
         [ 0.0274,     nan,     nan,     nan,     nan],
         [ 0.0230,     nan,     nan,     nan,     nan]],

        [[-0.0398,     nan,     nan,     nan,     nan],
         [-0.0192,     nan,     nan,     nan,     nan],
         [ 0.0518,     nan,     nan,     nan,     nan],
         ...,
         [ 0.0546,     nan,     nan,     nan,     nan],
         [ 0.0532,     nan,     nan,     nan,     nan],
         [ 0.0267,     nan,     nan,     nan,     nan]],

        ...,

        [[-0.0477,     nan,     nan,     nan,     nan],
         [-0.0293,     nan,     nan,     nan,     nan],
         [ 0.0281,     nan,     nan,     nan,     nan],
         ...,
         [ 0.0085,     nan,     nan,     nan,     nan],
         [ 0.0318,     nan,     nan,     nan,     nan],
         [ 0.0186,     nan,     nan,     nan,     nan]],

        [[-0.0529,     nan,     nan,     nan,     nan],
         [-0.0292,     nan,     nan,     nan,     nan],
         [-0.0112,     nan,     nan,     nan,     nan],
         ...,
         [ 0.0211,     nan,     nan,     nan,     nan],
         [ 0.0255,     nan,     nan,     nan,     nan],
         [ 0.0048,     nan,     nan,     nan,     nan]],

        [[ 0.0247,     nan,     nan,     nan,     nan],
         [-0.0090,     nan,     nan,     nan,     nan],
         [ 0.0230,     nan,     nan,     nan,     nan],
         ...,
         [-0.0449,     nan,     nan,     nan,     nan],
         [ 0.0387,     nan,     nan,     nan,     nan],
         [ 0.0173,     nan,     nan,     nan,     nan]]], requires_grad=True)

Could you check if the model output contains NaN values before calculating the loss and calling backward()?
Did you check whether the inputs contain only finite elements?
Also, could you rerun the script with torch.autograd.set_detect_anomaly(True) and post the stack trace here, please?

If I understand it correctly, you are getting these NaN values in the second iteration?
If so, are you able to reproduce this behavior with random inputs, so that we could debug it?
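
For reference, a minimal way to wire those checks into a single training step might look like this (the function name checked_train_step and the assert messages are mine):

torch.autograd.set_detect_anomaly(True)  # backward() will raise at the op that first produces NaN

def checked_train_step(model, criterion, optim, batch, label):
    # sanity-check the input and the forward pass before computing the loss
    assert torch.isfinite(batch).all(), "input contains NaN/Inf"
    output = model(batch)
    assert torch.isfinite(output).all(), "forward pass produced NaN/Inf"
    loss = criterion(output, label)
    optim.zero_grad()
    loss.backward()
    optim.step()
    return loss.item()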

When I use torch.autograd.set_detect_anomaly(True) in the train function after the optim step, it shows the traces below.
Sometimes the first few batches run smoothly, but then the weights (the kernels and biases) suddenly start turning into NaN values.
I checked the input data and it is fine: it doesn't contain any NaN values after computing the MFCC features.
The model contains NaN values after the optim step.

Note: When I replace ReLU with Tanh, it somehow works, but after some iterations (>50) it starts to give NaN values again. When I analyse the weights, they don't change. I am confused.

loss.backward()
(Pdb) n
Warning: Error detected in NativeBatchNormBackward. Traceback of forward call that caused the error:
  File "Code/main.py", line 178, in <module>
    main()
  File "Code/main.py", line 134, in main
    train_avg_loss, correct_train, total_train = initiate_train_val.train(model, criterion, optim, epoch, train_stutter_dataset, lr, device)
  File "/home/shsheikh/PhD/audio_modality/dysfluencydetectionaudiomodality/Code/train.py", line 89, in train
    output = model(batch)                # Forward Pass
  File "/home/shsheikh/anaconda3/envs/pytorch/lib/python3.5/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/shsheikh/PhD/audio_modality/dysfluencydetectionaudiomodality/Code/tdnn_debug.py", line 143, in forward
    input = self.bn8(input)
  File "/home/shsheikh/anaconda3/envs/pytorch/lib/python3.5/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/shsheikh/anaconda3/envs/pytorch/lib/python3.5/site-packages/torch/nn/modules/batchnorm.py", line 106, in forward
    exponential_average_factor, self.eps)
  File "/home/shsheikh/anaconda3/envs/pytorch/lib/python3.5/site-packages/torch/nn/functional.py", line 1923, in batch_norm
    training, momentum, eps, torch.backends.cudnn.enabled
 (print_stack at /opt/conda/conda-bld/pytorch_1591914985702/work/torch/csrc/autograd/python_anomaly_mode.cpp:60)
RuntimeError: Function 'NativeBatchNormBackward' returned nan values in its 0th output.
> /home/shsheikh/PhD/audio_modality/dysfluencydetectionaudiomodality/Code/train.py(97)train()

#The second trace
Warning: Error detected in LogSoftmaxBackward. Traceback of forward call that caused the error:
  File "Code/main.py", line 178, in <module>
    main()
  File "Code/main.py", line 134, in main
    train_avg_loss, correct_train, total_train = initiate_train_val.train(model, criterion, optim, epoch, train_stutter_dataset, lr, device)
  File "/home/shsheikh/PhD/audio_modality/dysfluencydetectionaudiomodality/Code/train.py", line 92, in train
    loss = criterion(output, label)      #Loss
  File "/home/shsheikh/anaconda3/envs/pytorch/lib/python3.5/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/shsheikh/anaconda3/envs/pytorch/lib/python3.5/site-packages/torch/nn/modules/loss.py", line 932, in forward
    ignore_index=self.ignore_index, reduction=self.reduction)
  File "/home/shsheikh/anaconda3/envs/pytorch/lib/python3.5/site-packages/torch/nn/functional.py", line 2317, in cross_entropy
    return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
  File "/home/shsheikh/anaconda3/envs/pytorch/lib/python3.5/site-packages/torch/nn/functional.py", line 1535, in log_softmax
    ret = input.log_softmax(dim)
 (print_stack at /opt/conda/conda-bld/pytorch_1591914985702/work/torch/csrc/autograd/python_anomaly_mode.cpp:60)
Traceback (most recent call last):
  File "Code/main.py", line 178, in <module>
    main()
  File "Code/main.py", line 134, in main
    train_avg_loss, correct_train, total_train = initiate_train_val.train(model, criterion, optim, epoch, train_stutter_dataset, lr, device)
  File "/home/shsheikh/PhD/audio_modality/dysfluencydetectionaudiomodality/Code/train.py", line 97, in train
    loss.backward()
  File "/home/shsheikh/anaconda3/envs/pytorch/lib/python3.5/site-packages/torch/tensor.py", line 198, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/shsheikh/anaconda3/envs/pytorch/lib/python3.5/site-packages/torch/autograd/__init__.py", line 100, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: Function 'LogSoftmaxBackward' returned nan values in its 0th output.

Thanks for the update. Could you check for NaN values in all .grad attributes before calling optimizer.step()?
If the input and output are fine, but the step() call creates the NaNs, the gradients must contain invalid values.

You can check it via:

for name, param in model.named_parameters():
    print(name, torch.isfinite(param.grad).all())

after the backward call.
If you get an invalid value in the gradients, could you store the input, target as well as the model.state_dict and upload it somewhere so that we could try to reproduce and debug this issue?
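
Note that .grad stays None for any parameter that did not take part in the backward pass, so a None-safe variant of that check (my addition, anticipating the error shown below) would be:

for name, param in model.named_parameters():
    if param.grad is None:
        print(name, "grad is None (parameter not used in backward?)")
    else:
        print(name, torch.isfinite(param.grad).all())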


When I use ReLU as an activation function, the following lines

for name, param in model.named_parameters():
    print(name, torch.isfinite(param.grad).all())

give the output below.
Note: The weights become NaN first; once the convolutions are applied, the NaNs propagate through the activations, and some param.grad attributes turn up as None.

Model Parameters conv1d_tdnn1.kernel tensor(True)
Model Parameters conv1d_tdnn1.bias tensor(True)
Model Parameters bn1.weight tensor(True)
Model Parameters bn1.bias tensor(True)
Model Parameters conv1d_tdnn2.kernel tensor(True)
Model Parameters conv1d_tdnn2.bias tensor(True)
Model Parameters bn2.weight tensor(True)
Model Parameters bn2.bias tensor(True)
Traceback (most recent call last):
  File "Code/main.py", line 180, in <module>
    main()
  File "Code/main.py", line 136, in main
    train_avg_loss, correct_train, total_train = initiate_train_val.train(model, criterion, optim, epoch, train_stutter_dataset, lr, device)
  File "/home/shsheikh/PhD/audio_modality/dysfluencydetectionaudiomodality/Code/train.py", line 101, in train
    print("Model Parameters",name, torch.isfinite(param.grad).all())
TypeError: isfinite(): argument 'input' (position 1) must be Tensor, not NoneType

...

But when I use Tanh as an activation function, it gives the following

=============TOTAL PARAMETERS================== 300917
Model Parameters conv1d_tdnn1.kernel tensor(True)
Model Parameters conv1d_tdnn1.bias tensor(True)
Model Parameters bn1.weight tensor(True)
Model Parameters bn1.bias tensor(True)
Model Parameters conv1d_tdnn2.kernel tensor(True)
Model Parameters conv1d_tdnn2.bias tensor(True)
Model Parameters bn2.weight tensor(True)
Model Parameters bn2.bias tensor(True)
Model Parameters conv1d_tdnn3.kernel tensor(True)
Model Parameters conv1d_tdnn3.bias tensor(True)
Model Parameters bn3.weight tensor(True)
Model Parameters bn3.bias tensor(True)
Model Parameters conv1d_tdnn4.kernel tensor(True)
Model Parameters conv1d_tdnn4.bias tensor(True)
Model Parameters bn4.weight tensor(True)
Model Parameters bn4.bias tensor(True)
Model Parameters conv1d_tdnn5.kernel tensor(True)
Model Parameters conv1d_tdnn5.bias tensor(True)
Model Parameters bn5.weight tensor(True)
Model Parameters bn5.bias tensor(True)
Model Parameters conv1d_tdnn6.kernel tensor(True)
Model Parameters conv1d_tdnn6.bias tensor(True)
Model Parameters bn6.weight tensor(True)
Model Parameters bn6.bias tensor(True)
Model Parameters conv1d1.weight tensor(True)
Model Parameters conv1d1.bias tensor(True)
Model Parameters bn7.weight tensor(True)
Model Parameters bn7.bias tensor(True)
Model Parameters conv1d2.weight tensor(True)
Model Parameters conv1d2.bias tensor(True)
Model Parameters bn8.weight tensor(True)
Model Parameters bn8.bias tensor(True)
Model Parameters fc1.weight tensor(True)
Model Parameters fc1.bias tensor(True)
... (this identical block of all-finite checks repeats for every subsequent iteration; truncated here) ...
Model State Dictionary <bound method Module.state_dict of TimeDelayNN(
  (conv1d_tdnn1): TimeDelayBaseModel()
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (tanh): Tanh()
  (relu): ReLU()
  (conv1d_tdnn2): TimeDelayBaseModel()
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv1d_tdnn3): TimeDelayBaseModel()
  (bn3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv1d_tdnn4): TimeDelayBaseModel()
  (bn4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (conv1d_tdnn5): TimeDelayBaseModel()
  (bn5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv1d_tdnn6): TimeDelayBaseModel()
  (bn6): BatchNorm1d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
  (conv1d1): Conv1d(128, 64, kernel_size=(1,), stride=(1,))
  (bn7): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (conv1d2): Conv1d(64, 16, kernel_size=(1,), stride=(1,))
  (bn8): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=32, out_features=5, bias=True)

Do you mean the sample dataset and the target values?

It seems there are different issues now.

If you are using ReLU as the activation function, it seems that you are somehow detaching (some) parameters, since conv1d_tdnn2.kernel.grad suddenly seems to be None (while it works using tanh).

Anyway, with tanh the check shows that the gradients of all parameters are finite before the update step.
Nevertheless, the next update creates invalid parameters.

The only reason I see now is that you might be dealing with an under/overflow of some values.

Could you check the max absolute values of the gradients as well as of the parameters before the critical update step that creates the NaNs?
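
A sketch of that magnitude check (None-safe, to be run between backward() and step(); the formatting is mine):

for name, param in model.named_parameters():
    p_max = param.detach().abs().max().item()
    g_max = param.grad.abs().max().item() if param.grad is not None else float("nan")
    print(f"{name}: max |param| = {p_max:.4e}, max |grad| = {g_max:.4e}")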

Output for torch.max(abs(param.grad))

Model Parameters Gradients conv1d_tdnn1.kernel tensor(0.0776)
Model Parameters Gradients conv1d_tdnn1.bias tensor(0.0294)
Model Parameters Gradients bn1.weight tensor(0.0034)
Model Parameters Gradients bn1.bias tensor(0.0023)
Model Parameters Gradients conv1d_tdnn2.kernel tensor(0.0044)
Model Parameters Gradients conv1d_tdnn2.bias tensor(0.0018)
Model Parameters Gradients bn2.weight tensor(0.0022)
Model Parameters Gradients bn2.bias tensor(0.0007)
Traceback (most recent call last):
  File "Code/main.py", line 180, in <module>
    main()
  File "Code/main.py", line 136, in main
    train_avg_loss, correct_train, total_train = initiate_train_val.train(model, criterion, optim, epoch, train_stutter_dataset, lr, device)
  File "/home/shsheikh/PhD/audio_modality/dysfluencydetectionaudiomodality/Code/train.py", line 106, in train
    print("Model Parameters Gradients",name,torch.max(abs(param.grad)))
TypeError: bad operand type for abs(): 'NoneType'

Output for loss.data and loss.grad

loss.data: >>>>>>>>>>>>>>>>>>>>>  tensor(1.6281)
loss.grad: >>>>>>>>>>>>>>>>>>>>>  None
Model Parameters Gradients conv1d_tdnn1.kernel tensor(nan)
Model Parameters Gradients conv1d_tdnn1.bias tensor(nan)
Model Parameters Gradients bn1.weight tensor(nan)
Model Parameters Gradients bn1.bias tensor(nan)
Model Parameters Gradients conv1d_tdnn2.kernel tensor(nan)
Model Parameters Gradients conv1d_tdnn2.bias tensor(nan)
Model Parameters Gradients bn2.weight tensor(nan)
Model Parameters Gradients bn2.bias tensor(nan)
Traceback (most recent call last):
  File "Code/main.py", line 180, in <module>
    main()
  File "Code/main.py", line 136, in main
    train_avg_loss, correct_train, total_train = initiate_train_val.train(model, criterion, optim, epoch, train_stutter_dataset, lr, device)
  File "/home/shsheikh/PhD/audio_modality/dysfluencydetectionaudiomodality/Code/train.py", line 108, in train
    print("Model Parameters Gradients",name,torch.max(abs(param.grad)))
TypeError: bad operand type for abs(): 'NoneType'

When I print the same with Tanh, it still gives NaN values:

=============TOTAL PARAMETERS================== 660469
Model Parameters Gradients conv1d_tdnn1.kernel tensor(0.)
Model Parameters Gradients conv1d_tdnn1.bias tensor(0.)
Model Parameters Gradients bn1.weight tensor(0.)
Model Parameters Gradients bn1.bias tensor(0.)
Model Parameters Gradients conv1d_tdnn2.kernel tensor(0.)
Model Parameters Gradients conv1d_tdnn2.bias tensor(0.)
Model Parameters Gradients bn2.weight tensor(0.)
Model Parameters Gradients bn2.bias tensor(0.)
Model Parameters Gradients conv1d_tdnn3.kernel tensor(0.)
Model Parameters Gradients conv1d_tdnn3.bias tensor(0.)
Model Parameters Gradients bn3.weight tensor(0.)
Model Parameters Gradients bn3.bias tensor(0.)
Model Parameters Gradients conv1d_tdnn4.kernel tensor(0.)
Model Parameters Gradients conv1d_tdnn4.bias tensor(0.)
Model Parameters Gradients bn4.weight tensor(0.)
Model Parameters Gradients bn4.bias tensor(0.)
Model Parameters Gradients conv1d_tdnn5.kernel tensor(0.)
Model Parameters Gradients conv1d_tdnn5.bias tensor(0.)
Model Parameters Gradients bn5.weight tensor(0.)
Model Parameters Gradients bn5.bias tensor(0.)
Model Parameters Gradients conv1d_tdnn6.kernel tensor(0.)
Model Parameters Gradients conv1d_tdnn6.bias tensor(0.)
Model Parameters Gradients bn6.weight tensor(0.)
Model Parameters Gradients bn6.bias tensor(0.)
Model Parameters Gradients conv1d_tdnn7.kernel tensor(0.)
Model Parameters Gradients conv1d_tdnn7.bias tensor(0.)
Model Parameters Gradients bn7.weight tensor(0.)
Model Parameters Gradients bn7.bias tensor(0.)
Model Parameters Gradients conv1d_tdnn8.kernel tensor(nan)
Model Parameters Gradients conv1d_tdnn8.bias tensor(nan)
Model Parameters Gradients bn8.weight tensor(nan)
Model Parameters Gradients bn8.bias tensor(nan)
Model Parameters Gradients conv1d1.weight tensor(nan)
Model Parameters Gradients conv1d1.bias tensor(nan)
Model Parameters Gradients bn9.weight tensor(nan)
Model Parameters Gradients bn9.bias tensor(nan)
Model Parameters Gradients conv1d3.weight tensor(nan)
Model Parameters Gradients conv1d3.bias tensor(nan)
Model Parameters Gradients bn11.weight tensor(nan)
Model Parameters Gradients bn11.bias tensor(nan)
Model Parameters Gradients conv1d2.weight tensor(nan)
Model Parameters Gradients conv1d2.bias tensor(nan)
Model Parameters Gradients bn10.weight tensor(nan)
Model Parameters Gradients bn10.bias tensor(nan)
Model Parameters Gradients fc1.weight tensor(4.6971e-09)
Model Parameters Gradients fc1.bias tensor(0.0470)
[Gradients ====>]
Kernel and bias after the TDNN conv layers: Parameter containing:
tensor([[[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        ...,

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]],

        [[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]]], requires_grad=True) Parameter containing:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       requires_grad=True)

@ptrblck

I’m a bit confused now.

In the previous post you mentioned that all gradients contain valid values and that the parameters suddenly become NaN:

Your print statement also shows that all gradients are valid before the first NaNs are created.

Now some gradients seem to contain NaNs?

Could you add some more information to your debugging, please?
I.e., in particular: when and where are the first NaN values observed?
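
One way to localize the first non-finite activation (a sketch using forward hooks; not from the original thread) is to register a check on every submodule:

def make_nan_hook(name):
    def hook(module, inputs, output):
        # raise as soon as any module emits a non-finite tensor
        if isinstance(output, torch.Tensor) and not torch.isfinite(output).all():
            raise RuntimeError(f"first non-finite activation produced by {name}")
    return hook

for name, module in model.named_modules():
    module.register_forward_hook(make_nan_hook(name))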

Sorry, I think I mixed the two up. Here it is again:

1. When I debug the model (with the ReLU activation function), the NaN values appear just after optim.step().

My loss.grad is always None. Why is that?
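
(As an aside on this point: autograd only populates .grad on leaf tensors by default, and loss is a non-leaf result of the graph, so its .grad stays None unless it is explicitly retained. A minimal illustration, reusing the names from the training loop above:)

loss = criterion(output, label)
loss.retain_grad()   # keep the gradient of this non-leaf tensor
loss.backward()
print(loss.grad)     # tensor(1.) for a scalar loss, instead of None
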
When lr = 1e-3, the gradients change, as shown below:

[TIME DELAY KERNELS  Initially] :
tensor([[[ 0.0464,  0.1265, -0.0264, -0.1229,  0.0058],
         [ 0.0775,  0.0588, -0.0519, -0.0402, -0.0509],
         [-0.1246,  0.1113,  0.0533, -0.0575,  0.0025],
         ...,
         [ 0.0988, -0.0924, -0.0649, -0.1004,  0.1123],
         [-0.0736,  0.0166, -0.1075, -0.0556,  0.0434],
         [ 0.0260,  0.0104, -0.0937, -0.0415,  0.1184]],

        [[-0.0510, -0.0046,  0.0368,  0.0506,  0.0200],
         [-0.0930, -0.0011, -0.0559,  0.0898,  0.0883],
         [ 0.0701, -0.1162, -0.1195, -0.0312,  0.0920],
         ...,
         [-0.0935,  0.0423,  0.0812, -0.0165,  0.0404],
         [-0.0876, -0.0143,  0.0941, -0.0526,  0.1147],
         [ 0.0211, -0.0701, -0.1012,  0.0667,  0.0170]],

        [[-0.1150,  0.0097, -0.0775,  0.0942,  0.0081],
         [-0.0979,  0.1238,  0.0203,  0.0908,  0.0247],
         [ 0.0847, -0.1289, -0.0263,  0.0161,  0.0947],
         ...,
         [ 0.0700, -0.1172, -0.0993,  0.0552,  0.0867],
         [-0.0777,  0.0926, -0.0298,  0.1178, -0.1251],
         [ 0.0866,  0.0364, -0.0255,  0.1266, -0.1166]],

        ...,

        [[ 0.1249, -0.0482,  0.0779, -0.0275, -0.1257],
         [ 0.0598, -0.0873, -0.0151, -0.0356, -0.1173],
         [ 0.0811,  0.0579,  0.1214, -0.0987, -0.1275],
         ...,
         [ 0.0394, -0.0167, -0.0050, -0.0497,  0.0201],
         [-0.0960, -0.0528, -0.0286, -0.0884,  0.0214],
         [-0.0445,  0.0251,  0.0291,  0.0033, -0.1236]],

        [[-0.0502,  0.1278, -0.0566, -0.1221,  0.0385],
         [ 0.1289,  0.0842,  0.1103,  0.0374,  0.0353],
         [ 0.0124,  0.0753, -0.0150,  0.0220, -0.0303],
         ...,
         [-0.0197,  0.0386,  0.0428, -0.0563, -0.0295],
         [ 0.0788, -0.0783,  0.0876,  0.0235,  0.1071],
         [-0.0727, -0.0662,  0.0767,  0.0616, -0.0774]],

        [[ 0.1132, -0.0164, -0.0617,  0.0775, -0.0712],
         [ 0.0469, -0.0217,  0.1169,  0.0575, -0.1192],
         [-0.1204,  0.0910, -0.0542, -0.0739,  0.0933],
         ...,
         [ 0.0845, -0.0405,  0.0301,  0.1049,  0.1191],
         [-0.0213, -0.1174, -0.1137, -0.0713,  0.0823],
         [ 0.0046,  0.1154,  0.0838, -0.0695,  0.0771]]], requires_grad=True)

loss.data: >>>>>>>>>>>>>>>>>>>>>  tensor(1.6111)
loss.grad: >>>>>>>>>>>>>>>>>>>>>  None

[TIME DELAY KERNELS ######### MODEL WEIGHTS CONVTDNN1D After backward()] Parameter containing:
(identical to the initial kernel values printed above; backward() only populates the gradients and does not modify the weights)
for name, param in model.named_parameters():
    print("Model Parameters", name, torch.isfinite(param.grad).all())
Output:
Model Parameters conv1d_tdnn1.kernel tensor(True)
Model Parameters conv1d_tdnn1.bias tensor(True)
Model Parameters bn1.weight tensor(True)
Model Parameters bn1.bias tensor(True)
Model Parameters conv1d_tdnn2.kernel tensor(True)
Model Parameters conv1d_tdnn2.bias tensor(True)
Model Parameters bn2.weight tensor(True)
Model Parameters bn2.bias tensor(True)
Model Parameters conv1d_tdnn3.kernel tensor(True)
Model Parameters conv1d_tdnn3.bias tensor(True)
Model Parameters bn3.weight tensor(True)
Model Parameters bn3.bias tensor(True)
Model Parameters conv1d_tdnn4.kernel tensor(True)
Model Parameters conv1d_tdnn4.bias tensor(True)
Model Parameters bn4.weight tensor(True)
Model Parameters bn4.bias tensor(True)
Model Parameters conv1d_tdnn5.kernel tensor(True)
Model Parameters conv1d_tdnn5.bias tensor(True)
Model Parameters bn5.weight tensor(True)
Model Parameters bn5.bias tensor(True)
Model Parameters conv1d_tdnn6.kernel tensor(True)
Model Parameters conv1d_tdnn6.bias tensor(True)
Model Parameters bn6.weight tensor(True)
Model Parameters bn6.bias tensor(True)
Model Parameters conv1d_tdnn7.kernel tensor(True)
Model Parameters conv1d_tdnn7.bias tensor(True)
Model Parameters bn7.weight tensor(True)
Model Parameters bn7.bias tensor(True)
Model Parameters conv1d_tdnn8.kernel tensor(True)
Model Parameters conv1d_tdnn8.bias tensor(True)
Model Parameters bn8.weight tensor(True)
Model Parameters bn8.bias tensor(True)
Model Parameters conv1d1.weight tensor(True)
Model Parameters conv1d1.bias tensor(True)
Model Parameters bn9.weight tensor(True)
Model Parameters bn9.bias tensor(True)
Model Parameters conv1d2.weight tensor(True)
Model Parameters conv1d2.bias tensor(True)
Model Parameters bn17.weight tensor(True)
Model Parameters bn17.bias tensor(True)
Model Parameters fc1.weight tensor(True)
Model Parameters fc1.bias tensor(True)

[TIME DELAY KERNELS ######### MODEL WEIGHTS after optim step] Parameter containing:
tensor([[[ 4.5370e-02,  1.2555e-01, -2.7370e-02, -1.2193e-01,  6.8261e-03],
         [ 7.8489e-02,  5.9769e-02, -5.0870e-02, -3.9160e-02, -4.9935e-02],
         [-1.2560e-01,  1.1027e-01,  5.2297e-02, -5.8546e-02,  1.4725e-03],
         ...,
         [ 9.9825e-02, -9.1410e-02, -6.5898e-02, -1.0138e-01,  1.1327e-01],
         [-7.2554e-02,  1.7553e-02, -1.0855e-01, -5.6650e-02,  4.4416e-02],
         [ 2.5022e-02,  9.3778e-03, -9.2745e-02, -4.2471e-02,  1.1742e-01]],

        [[-4.9999e-02, -3.5967e-03,  3.5824e-02,  4.9602e-02,  1.9006e-02],
         [-9.4040e-02, -1.1871e-04, -5.4942e-02,  9.0787e-02,  8.9262e-02],
         [ 7.1104e-02, -1.1721e-01, -1.2053e-01, -3.0214e-02,  9.2959e-02],
         ...,
         [-9.4487e-02,  4.1319e-02,  8.2165e-02, -1.5533e-02,  4.1357e-02],
         [-8.6648e-02, -1.3317e-02,  9.5130e-02, -5.1597e-02,  1.1569e-01],
         [ 2.2133e-02, -6.9081e-02, -1.0019e-01,  6.7732e-02,  1.8049e-02]],

        [[-1.1600e-01,  8.7238e-03, -7.8496e-02,  9.3240e-02,  7.1277e-03],
         [-9.6913e-02,  1.2276e-01,  1.9306e-02,  8.9796e-02,  2.3663e-02],
         [ 8.5686e-02, -1.2791e-01, -2.5250e-02,  1.7085e-02,  9.5654e-02],
         ...,
         [ 7.1008e-02, -1.1619e-01, -1.0031e-01,  5.6212e-02,  8.7727e-02],
         [-7.6723e-02,  9.3636e-02, -3.0839e-02,  1.1676e-01, -1.2606e-01],
         [ 8.5552e-02,  3.5442e-02, -2.6544e-02,  1.2565e-01, -1.1764e-01]],

        ...,

        [[ 1.2394e-01, -4.9221e-02,  7.6931e-02, -2.6519e-02, -1.2469e-01],
         [ 5.8770e-02, -8.8278e-02, -1.6056e-02, -3.6625e-02, -1.1831e-01],
         [ 8.2052e-02,  5.8877e-02,  1.2242e-01, -9.7689e-02, -1.2652e-01],
         ...,
         [ 4.0395e-02, -1.5654e-02, -3.9867e-03, -4.8731e-02,  1.9100e-02],
         [-9.5034e-02, -5.1830e-02, -2.7647e-02, -8.7446e-02,  2.2397e-02],
         [-4.3457e-02,  2.6086e-02,  3.0136e-02,  4.2744e-03, -1.2261e-01]],

        [[-4.9196e-02,  1.2683e-01, -5.7618e-02, -1.2310e-01,  3.7461e-02],
         [ 1.2793e-01,  8.5212e-02,  1.1134e-01,  3.8424e-02,  3.6294e-02],
         [ 1.1408e-02,  7.6318e-02, -1.3972e-02,  2.3047e-02, -2.9326e-02],
         ...,
         [-2.0711e-02,  3.7618e-02,  4.1771e-02, -5.7251e-02, -3.0511e-02],
         [ 7.7848e-02, -7.9281e-02,  8.8627e-02,  2.4537e-02,  1.0809e-01],
         [-7.1702e-02, -6.5161e-02,  7.7686e-02,  6.2586e-02, -7.6438e-02]],

        [[ 1.1423e-01, -1.5357e-02, -6.0655e-02,  7.8528e-02, -7.0191e-02],
         [ 4.5850e-02, -2.2736e-02,  1.1593e-01,  5.6516e-02, -1.2018e-01],
         [-1.2140e-01,  8.9994e-02, -5.5193e-02, -7.4852e-02,  9.2298e-02],
         ...,
         [ 8.3527e-02, -4.1535e-02,  2.9075e-02,  1.0388e-01,  1.1814e-01],
         [-2.0266e-02, -1.1640e-01, -1.1469e-01, -7.2341e-02,  8.1306e-02],
         [ 5.6036e-03,  1.1435e-01,  8.2755e-02, -7.0547e-02,  7.6113e-02]]],
       requires_grad=True)

For the second batch:

loss.data: >>>>>>>>>>>>>>>>>>>>>  tensor(1.5873)
loss.grad: >>>>>>>>>>>>>>>>>>>>>  None
for name, param in model.named_parameters():
    print("Model Parameters", name, torch.isfinite(param.grad).all())
Output:
Model Parameters conv1d_tdnn1.kernel tensor(False)
Model Parameters conv1d_tdnn1.bias tensor(False)
Model Parameters bn1.weight tensor(False)
Model Parameters bn1.bias tensor(False)
Model Parameters conv1d_tdnn2.kernel tensor(False)
Model Parameters conv1d_tdnn2.bias tensor(False)
Model Parameters bn2.weight tensor(False)
Model Parameters bn2.bias tensor(False)
Model Parameters conv1d_tdnn3.kernel tensor(False)
Model Parameters conv1d_tdnn3.bias tensor(False)
Model Parameters bn3.weight tensor(False)
Model Parameters bn3.bias tensor(False)
Model Parameters conv1d_tdnn4.kernel tensor(False)
Model Parameters conv1d_tdnn4.bias tensor(False)
Model Parameters bn4.weight tensor(False)
Model Parameters bn4.bias tensor(False)
Model Parameters conv1d_tdnn5.kernel tensor(False)
Model Parameters conv1d_tdnn5.bias tensor(False)
Model Parameters bn5.weight tensor(False)
Model Parameters bn5.bias tensor(False)
Model Parameters conv1d_tdnn6.kernel tensor(False)
Model Parameters conv1d_tdnn6.bias tensor(False)
Model Parameters bn6.weight tensor(False)
Model Parameters bn6.bias tensor(False)
Model Parameters conv1d_tdnn7.kernel tensor(False)
Model Parameters conv1d_tdnn7.bias tensor(False)
Model Parameters bn7.weight tensor(False)
Model Parameters bn7.bias tensor(False)
Model Parameters conv1d_tdnn8.kernel tensor(False)
Model Parameters conv1d_tdnn8.bias tensor(False)
Model Parameters bn8.weight tensor(False)
Model Parameters bn8.bias tensor(False)
Model Parameters conv1d1.weight tensor(False)
Model Parameters conv1d1.bias tensor(False)
Model Parameters bn9.weight tensor(False)
Model Parameters bn9.bias tensor(False)
Model Parameters conv1d2.weight tensor(False)
Model Parameters conv1d2.bias tensor(False)
Model Parameters bn17.weight tensor(False)
Model Parameters bn17.bias tensor(False)
Model Parameters fc1.weight tensor(True)
Model Parameters fc1.bias tensor(True)
[TIME DELAY KERNELS ######### MODEL WEIGHTS after optim step] Parameter containing:
tensor([[[nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         ...,
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan]],

        [[nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         ...,
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan]],

        [[nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         ...,
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan]],

        ...,

        [[nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         ...,
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan]],

        [[nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         ...,
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan]],

        [[nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         ...,
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan]]], requires_grad=True)

But when I use lr=1e-6,
the weights/kernels seem not to change at all, as shown below (though they still become NaN after a few epochs).

[TIME DELAY KERNELS #########] Parameter containing:
tensor([[[ 0.0079,  0.0298, -0.0579,  0.0080,  0.0100],
         [ 0.0063,  0.0655, -0.1080,  0.0079,  0.0759],
         [-0.1031,  0.1014,  0.0356,  0.0277,  0.0546],
         ...,
         [ 0.1165,  0.0836,  0.1217, -0.0938, -0.0234],
         [ 0.0288,  0.0526, -0.0533,  0.0694, -0.0221],
         [-0.0054,  0.0437,  0.0381, -0.0287, -0.0823]],

        [[-0.0571, -0.0404, -0.0376, -0.0070, -0.1035],
         [ 0.0561, -0.0900,  0.1084,  0.1146,  0.0830],
         [-0.1018,  0.0085,  0.0125, -0.0679, -0.1270],
         ...,
         [ 0.1034, -0.1151,  0.1013, -0.0041,  0.0461],
         [-0.0316,  0.0530,  0.0920,  0.0571,  0.0864],
         [-0.0450,  0.0983, -0.1215, -0.0478,  0.0072]],

        [[ 0.0602, -0.0368,  0.0419,  0.0913, -0.0916],
         [-0.0727,  0.0879, -0.1193, -0.0528, -0.1224],
         [ 0.0742, -0.0206, -0.0934, -0.0904,  0.0351],
         ...,
         [ 0.1246, -0.0076,  0.1060,  0.0746, -0.0602],
         [-0.0234,  0.0270,  0.0291, -0.0217,  0.1224],
         [-0.0776, -0.1226,  0.0947, -0.0233, -0.0771]],

        ...,

        [[-0.1039,  0.1198,  0.1153,  0.0678, -0.0050],
         [ 0.0494,  0.0957,  0.0433,  0.0602, -0.1126],
         [ 0.0615, -0.0724,  0.0260,  0.0691,  0.0008],
         ...,
         [ 0.0602, -0.0231,  0.0401,  0.0097,  0.0461],
         [-0.1059,  0.0715, -0.0322,  0.0102, -0.0236],
         [ 0.1195, -0.1168, -0.0832,  0.0411,  0.0460]],

        [[-0.0090,  0.0193,  0.0612,  0.0548,  0.1203],
         [ 0.0440,  0.0113, -0.1176,  0.0363,  0.0760],
         [-0.0311, -0.0469, -0.1203, -0.0161,  0.0886],
         ...,
         [-0.0672, -0.0296,  0.0936,  0.0948,  0.0258],
         [-0.1272, -0.1178,  0.0858, -0.0509, -0.0762],
         [-0.0768, -0.0323, -0.0619,  0.0687,  0.0263]],

        [[-0.0427,  0.0458,  0.0305, -0.0878, -0.0284],
         [ 0.1127,  0.1272, -0.0467, -0.0870, -0.0614],
         [-0.0112,  0.0184, -0.0048,  0.0264,  0.0600],
         ...,
         [ 0.0914,  0.1015, -0.1064,  0.0297,  0.0450],
         [ 0.0778,  0.0999, -0.0055,  0.0816, -0.1034],
         [-0.0717, -0.1141,  0.0757,  0.0213,  0.0423]]], requires_grad=True)
loss.data: >>>>>>>>>>>>>>>>>>>>>  tensor(1.6079)
loss.grad: >>>>>>>>>>>>>>>>>>>>>  None
[TIME DELAY KERNELS ######### MODEL WEIGHTS after optim step] Parameter containing:
tensor([[[ 0.0079,  0.0298, -0.0579,  0.0080,  0.0100],
         [ 0.0063,  0.0655, -0.1080,  0.0079,  0.0759],
         [-0.1031,  0.1014,  0.0356,  0.0277,  0.0546],
         ...,
         [ 0.1165,  0.0836,  0.1217, -0.0938, -0.0234],
         [ 0.0288,  0.0526, -0.0533,  0.0694, -0.0221],
         [-0.0054,  0.0437,  0.0381, -0.0287, -0.0823]],

        [[-0.0571, -0.0404, -0.0376, -0.0070, -0.1035],
         [ 0.0561, -0.0900,  0.1084,  0.1146,  0.0830],
         [-0.1018,  0.0085,  0.0125, -0.0679, -0.1270],
         ...,
         [ 0.1034, -0.1151,  0.1013, -0.0041,  0.0461],
         [-0.0316,  0.0530,  0.0920,  0.0571,  0.0864],
         [-0.0450,  0.0983, -0.1215, -0.0478,  0.0072]],

        [[ 0.0602, -0.0368,  0.0419,  0.0913, -0.0916],
         [-0.0727,  0.0879, -0.1193, -0.0528, -0.1224],
         [ 0.0742, -0.0206, -0.0934, -0.0904,  0.0351],
         ...,
         [ 0.1246, -0.0076,  0.1060,  0.0746, -0.0602],
         [-0.0234,  0.0270,  0.0291, -0.0217,  0.1224],
         [-0.0776, -0.1226,  0.0947, -0.0233, -0.0771]],

        ...,

        [[-0.1039,  0.1198,  0.1153,  0.0678, -0.0050],
         [ 0.0494,  0.0957,  0.0433,  0.0602, -0.1126],
         [ 0.0615, -0.0724,  0.0260,  0.0691,  0.0008],
         ...,
         [ 0.0602, -0.0231,  0.0401,  0.0097,  0.0461],
         [-0.1059,  0.0715, -0.0322,  0.0102, -0.0236],
         [ 0.1195, -0.1168, -0.0832,  0.0411,  0.0460]],

        [[-0.0090,  0.0193,  0.0612,  0.0548,  0.1203],
         [ 0.0440,  0.0113, -0.1176,  0.0363,  0.0760],
         [-0.0311, -0.0469, -0.1203, -0.0161,  0.0886],
         ...,
         [-0.0672, -0.0296,  0.0936,  0.0948,  0.0258],
         [-0.1272, -0.1178,  0.0858, -0.0509, -0.0762],
         [-0.0768, -0.0323, -0.0619,  0.0687,  0.0263]],

        [[-0.0427,  0.0458,  0.0305, -0.0878, -0.0284],
         [ 0.1127,  0.1272, -0.0467, -0.0870, -0.0614],
         [-0.0112,  0.0184, -0.0048,  0.0264,  0.0600],
         ...,
         [ 0.0914,  0.1015, -0.1064,  0.0297,  0.0450],
         [ 0.0778,  0.0999, -0.0055,  0.0816, -0.1033],
         [-0.0717, -0.1141,  0.0757,  0.0213,  0.0423]]], requires_grad=True)
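
(Since lowering the learning rate only delays the blow-up rather than preventing it, gradient clipping is a common mitigation worth trying; a minimal sketch, assuming the usual backward/step loop:)

import torch

# Clip the global gradient norm before each optimizer step so that a
# single exploding batch cannot push the weights to Inf/NaN.
# loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
# optimizer.step()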

When I use lr=1e-4, the kernels become NaN again after a few epochs.

[TIME DELAY KERNELS #########] Parameter containing:
tensor([[[ 0.0136, -0.0752,  0.0020, -0.0965, -0.0131],
         [ 0.0635,  0.0099, -0.0959, -0.0854, -0.0325],
         [ 0.0286,  0.1149, -0.1007, -0.0021,  0.0026],
         ...,
         [-0.0003,  0.1163, -0.0021,  0.0870, -0.1104],
         [-0.0024,  0.0378, -0.0088, -0.0364, -0.0283],
         [-0.0635, -0.0541, -0.0629,  0.0534, -0.0200]],

        [[ 0.0144, -0.1141,  0.1095,  0.0856, -0.0459],
         [-0.0461,  0.0749, -0.0852,  0.0616, -0.0973],
         [-0.1217,  0.0756, -0.0908,  0.1120,  0.0292],
         ...,
         [-0.0219, -0.0834,  0.0487,  0.0458, -0.0246],
         [ 0.0370,  0.1010, -0.0155, -0.1216, -0.0037],
         [-0.1001, -0.0428, -0.0965,  0.0643,  0.1126]],

        [[-0.1100, -0.0551,  0.0958, -0.0667, -0.1217],
         [-0.0888,  0.0772,  0.1293,  0.0672, -0.1150],
         [-0.0810, -0.0997, -0.0865, -0.0070, -0.0275],
         ...,
         [-0.0956,  0.0067,  0.0814, -0.0341,  0.0146],
         [ 0.0009,  0.1233,  0.1066,  0.1227,  0.0112],
         [ 0.0494,  0.0964,  0.1136, -0.0609,  0.0278]],

        ...,

        [[ 0.0077,  0.0765,  0.0944, -0.0367, -0.1288],
         [ 0.0360,  0.1273,  0.0949, -0.1149, -0.0253],
         [-0.0977,  0.0820,  0.0507,  0.0170,  0.0841],
         ...,
         [ 0.0525,  0.1160, -0.0784, -0.0562,  0.0011],
         [ 0.0316,  0.0755,  0.0154,  0.0780, -0.0076],
         [-0.0287,  0.0956,  0.0803, -0.0724,  0.1140]],

        [[-0.1273, -0.0324, -0.0107,  0.0710,  0.0575],
         [-0.0106, -0.0954,  0.0516,  0.0549,  0.0785],
         [-0.0294, -0.0269,  0.0113,  0.0007, -0.0129],
         ...,
         [ 0.1290,  0.0674, -0.0439,  0.0626, -0.1003],
         [ 0.0332,  0.0544, -0.0148,  0.0892, -0.1052],
         [-0.0005,  0.0159, -0.0704, -0.0901,  0.1193]],

        [[-0.0171,  0.1152,  0.0823, -0.0116,  0.1046],
         [-0.0855, -0.1010, -0.0871,  0.0397, -0.0093],
         [ 0.0180,  0.0254,  0.1218, -0.0695, -0.0957],
         ...,
         [ 0.0437, -0.1261, -0.0877,  0.1015, -0.0517],
         [ 0.1105, -0.0370, -0.0133,  0.0120,  0.0652],
         [ 0.1052, -0.0098,  0.0578,  0.1058, -0.0507]]], requires_grad=True)

Model Parameters conv1d_tdnn1.kernel tensor(False)
Model Parameters conv1d_tdnn1.bias tensor(False)
Model Parameters bn1.weight tensor(False)
Model Parameters bn1.bias tensor(False)
Model Parameters conv1d_tdnn2.kernel tensor(False)
Model Parameters conv1d_tdnn2.bias tensor(False)
Model Parameters bn2.weight tensor(False)
Model Parameters bn2.bias tensor(False)
Model Parameters conv1d_tdnn3.kernel tensor(False)
Model Parameters conv1d_tdnn3.bias tensor(False)
Model Parameters bn3.weight tensor(False)
Model Parameters bn3.bias tensor(False)
Model Parameters conv1d_tdnn4.kernel tensor(False)
Model Parameters conv1d_tdnn4.bias tensor(False)
Model Parameters bn4.weight tensor(False)
Model Parameters bn4.bias tensor(False)
Model Parameters conv1d_tdnn5.kernel tensor(False)
Model Parameters conv1d_tdnn5.bias tensor(False)
Model Parameters bn5.weight tensor(False)
Model Parameters bn5.bias tensor(False)
Model Parameters conv1d_tdnn6.kernel tensor(False)
Model Parameters conv1d_tdnn6.bias tensor(False)
Model Parameters bn6.weight tensor(False)
Model Parameters bn6.bias tensor(False)
Model Parameters conv1d_tdnn7.kernel tensor(False)
Model Parameters conv1d_tdnn7.bias tensor(False)
Model Parameters bn7.weight tensor(False)
Model Parameters bn7.bias tensor(False)
Model Parameters conv1d_tdnn8.kernel tensor(False)
Model Parameters conv1d_tdnn8.bias tensor(False)
Model Parameters bn8.weight tensor(False)
Model Parameters bn8.bias tensor(False)
Model Parameters conv1d1.weight tensor(False)
Model Parameters conv1d1.bias tensor(False)
Model Parameters bn9.weight tensor(False)
Model Parameters bn9.bias tensor(False)
Model Parameters conv1d2.weight tensor(False)
Model Parameters conv1d2.bias tensor(False)
Model Parameters bn17.weight tensor(False)
Model Parameters bn17.bias tensor(False)
Model Parameters fc1.weight tensor(True)
Model Parameters fc1.bias tensor(True)
[TIME DELAY KERNELS ######### MODEL WEIGHTS after optim step] Parameter containing:
tensor([[[nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         ...,
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan]],

        [[nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         ...,
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan]],

        [[nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         ...,
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan]],

        ...,

        [[nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         ...,
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan]],

        [[nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         ...,
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan]],

        [[nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         ...,
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan]]], requires_grad=True)

Also, is this normalization okay for audio data?

import numpy as np
import torch
from torchvision import transforms

transformed_data = transforms.Compose([
    lambda input: input.astype(np.float64) / np.max(abs(input)),  # rescale to [-1, 1]
    lambda input: get_mfcc_features(input, sr, n_mfcc, n_fft, hop_length),
    lambda input: normalize(input),
    lambda input: torch.Tensor(input),
])

def normalize(input):
    input += 1e-5  # for numerical stability
    stdv = np.std(input)
    input = input - np.mean(input)  # zero mean
    input = input / stdv            # unit std
    if np.isnan(np.sum(input)):
        print("[NaN values in normalize::]", np.isnan(np.sum(input)))
    return input

It seems that the gradient in bn17 gets the first NaN values, while the last layer (fc1) still has valid gradients.
You could check how large the gradients in fc1 are and investigate why they might be overflowing in the bn17 layer.
A lower learning rate also seems to delay the first NaN result, which points to some values having a very high magnitude.
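
To locate where the first NaN gradient is produced, something like the following debugging sketch can help (assuming a reasonably recent PyTorch, >= 1.8, and that model is the network whose parameters are listed above):

import torch

# Option 1: anomaly detection. backward() will raise at the first
# operation that produces a NaN gradient and print the traceback of
# the corresponding forward op. It is slow, so use it only to debug.
torch.autograd.set_detect_anomaly(True)

# Option 2: log the gradient magnitude flowing out of each module,
# e.g. to see where it explodes between fc1 and bn17.
def make_hook(name):
    def hook(module, grad_input, grad_output):
        if grad_output[0] is not None:
            print(name, "grad max:", grad_output[0].abs().max().item())
    return hook

for name, module in model.named_modules():
    if name:  # skip the root module itself
        module.register_full_backward_hook(make_hook(name))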

Could you also check the stdv in your normalize method?
Since you are not adding an eps before dividing, the output could blow up if stdv is small or close to zero.
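
For reference, an eps-guarded version of the normalize function could look like this (a sketch; the eps value is an arbitrary choice):

import numpy as np

def normalize(input, eps=1e-8):
    # Divide by (std + eps) so the output stays bounded even when the
    # input is nearly constant and std is ~0.
    return (input - np.mean(input)) / (np.std(input) + eps)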

@ptrblck,
I fixed the NaN issue. The problem was in the statistics computed before feeding the activations to the FC layer: I was computing them across the time dimension instead of the feature dimension, and this was producing the NaN values.
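
For illustration, a minimal sketch of the two choices (the (batch, feature, time) layout and the sizes here are assumptions):

import torch

x = torch.randn(8, 64, 200)  # (batch, feature_dim, time)

# Statistics across the time axis (dim=2): one mean/std per feature map.
mean_t, std_t = x.mean(dim=2), x.std(dim=2)  # both (8, 64)

# Statistics across the feature axis (dim=1): one mean/std per frame.
mean_f, std_f = x.mean(dim=1), x.std(dim=1)  # both (8, 200)

# If the statistics are taken along the wrong axis, the std of some
# slices can be (near-)zero, and the resulting huge or Inf values
# propagate NaNs backward through the earlier layers.
pooled = torch.cat([mean_t, std_t], dim=1)   # (8, 128), fed to the FC layer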

Thank you so much @ptrblck for your feedback

I’m glad you’ve solved it.
What was the issue with calculating the stats along the other dimension? Did the std get close to zero and thus blow up the normalized inputs?

Thank you for your help as well.
Yes, I think this was the issue.

How did you solve it, please?