Gradients are becoming Zero, Weights not getting updated, Model not training

Hi,

Basically, I have built a NN, specifically an A2C, with several layers… Straight after the first update the gradients become zero. I tried increasing the learning rate incrementally from 1e-3 all the way to 100, but that didn’t resolve the problem… Not sure what to do… I would appreciate some help… Below the code I have placed the parameters as printed after backprop. Thanks again for any help!

I used the following code to print out the params. Because of the character limit, I had to trim the printed params… Basically every other layer is all zeros; you can see most of it in the text provided below.
# Dump every parameter tensor of the model, numbered in the order
# `parameters()` yields them, followed by its gradient.
for count, p in enumerate(self.model.parameters()):
    print("################# LAYER", count)
    print(p)
    # `print(p)` shows the weight values only; the gradient lives in `p.grad`
    # (None until a backward pass has populated it).
    print("grad:", p.grad)

My code is as follows:
class ActorCritic(nn.Module):
    """Actor-critic network with a shared 1-D convolutional trunk.

    Pipeline: five Conv1d stages -> flatten -> fc1 -> fc2 -> concatenate the
    previous action -> fc3 -> separate critic (scalar value) and actor
    (Gaussian mean) heads.  The policy's log standard deviation is a
    state-independent learnable parameter (``log_std``).

    Args:
        state_size: number of input channels of the first Conv1d.
        action_size: width of the actor output and of ``prev_action``.
        hidden_size: hidden width of the actor and critic heads.
        mini_batch_size: only used in the ``conv_out_size`` formula.
        num_channels: unused; kept so existing call sites keep working.
        std: initial value of every entry of ``log_std``.
        dropout_rate: dropout probability used in every conv and FC stage.
    """

    def __init__(self, state_size, action_size, hidden_size, mini_batch_size,
                 num_channels=3, std=0.0, dropout_rate=0.5):
        super().__init__()

        conv_out_channels = 1024

        # Convolution layers: channel widths double at every stage.
        self.conv1 = self._conv_stage(state_size, 64, dropout_rate)
        self.conv2 = self._conv_stage(64, 128, dropout_rate)
        self.conv3 = self._conv_stage(128, 256, dropout_rate)
        self.conv4 = self._conv_stage(256, 512, dropout_rate)
        self.conv5 = self._conv_stage(512, conv_out_channels, dropout_rate)

        # Flattened width fed to fc1.  This is the author's empirical formula;
        # it is not derived from the input length.
        # NOTE(review): if it does not match conv5's real output, the
        # `view(-1, conv_out_size)` in forward() silently regroups elements
        # across rows -- check the shapes once with debug=True.
        self.conv_out_size = conv_out_channels * (3 + mini_batch_size - 1)

        self.process_out = 100

        # Fully connected layers
        self.fc1 = self._fc_stage(self.conv_out_size, 1024, dropout_rate)
        self.fc2 = self._fc_stage(1024, action_size, dropout_rate)
        # fc3 consumes fc2's output concatenated with the previous action.
        self.fc3 = self._fc_stage(action_size + action_size, self.process_out,
                                  dropout_rate)

        self.critic = self._head(self.process_out, hidden_size, 1)
        self.actor = self._head(self.process_out, hidden_size, action_size)

        self.log_std = nn.Parameter(torch.ones(1, action_size) * std)
        self.apply(init_weights)
        print("############# LOG STD", self.log_std)

    @staticmethod
    def _conv_stage(in_channels, out_channels, dropout_rate):
        """Build one Conv1d -> BatchNorm1d -> ReLU -> Dropout -> MaxPool1d stage."""
        return nn.Sequential(
            nn.Conv1d(in_channels=in_channels, out_channels=out_channels,
                      kernel_size=3, stride=1, padding=2),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.MaxPool1d(kernel_size=2, stride=2),
        )

    @staticmethod
    def _feature_norm(num_features):
        """Return a parameter-free normalisation over the feature dimension.

        The original code used nn.InstanceNorm1d here (and, in fc3, with a
        ``num_features`` that did not match the layer width).  InstanceNorm1d
        is specified for (N, C, L) / (C, L) input; on the 2-D output of
        nn.Linear its behaviour has differed between PyTorch releases.
        NOTE(review): confirm against the installed PyTorch version.
        """
        return nn.LayerNorm(num_features, elementwise_affine=False)

    @classmethod
    def _fc_stage(cls, in_features, out_features, dropout_rate):
        """Build one Linear -> norm -> LeakyReLU -> Dropout stage."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            cls._feature_norm(out_features),
            nn.LeakyReLU(),
            nn.Dropout(dropout_rate),
        )

    @classmethod
    def _head(cls, in_features, hidden_size, out_features):
        """Build a Linear -> norm -> LeakyReLU -> Linear output head."""
        return nn.Sequential(
            nn.Linear(in_features, hidden_size),
            cls._feature_norm(hidden_size),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, out_features),
        )

    def forward(self, state, prev_action, debug=False):
        """Run the network and sample an action.

        Args:
            state: input to the first Conv1d (channels = ``state_size``).
            prev_action: previous action, concatenated to fc2's output along
                dim 0 -- assumes a 1-D tensor of length ``action_size`` and a
                single-row fc2 output; TODO confirm against the caller.
            debug: when true, print the tensor shape after every stage.

        Returns:
            Tuple ``(dist, value, sample)``: the Normal policy distribution,
            the critic output, and a sample drawn with ``dist.sample()``
            (which does not carry gradients).
        """
        def trace(label, tensor):
            # Shape tracing is opt-in so normal runs stay quiet.
            if debug:
                print(label, tensor.shape)

        trace("##### STATE", state)

        x = self.conv1(state)
        trace("##### CONV 1", x)

        x = self.conv2(x)
        trace("##### CONV 2", x)

        x = self.conv3(x)
        trace("##### CONV 3", x)

        x = self.conv4(x)
        trace("##### CONV 4", x)

        x = self.conv5(x)
        trace("##### CONV 5", x)

        # Flatten to rows of conv_out_size elements for the FC trunk.
        x = x.view(-1, self.conv_out_size)
        trace("##### VIEW", x)

        x = self.fc1(x)
        trace("##### FC 1", x)

        x = self.fc2(x)
        # Append the previous action to the features.
        x = torch.cat((x.squeeze(0), prev_action), dim=0)
        if debug:
            print("##### FC 2", x.shape, "PREV ACTION", prev_action.shape)

        # Restore the leading dimension dropped by squeeze(0) above.
        x = self.fc3(x.unsqueeze(0))
        trace("##### FC 3", x)

        value = self.critic(x)
        trace("##### CRITIC VALUE", value)

        # Adds one more leading dimension to the actor output.
        mu = self.actor(x).unsqueeze(0)
        trace("##### ACTOR MU", mu)

        std = self.log_std.exp().expand_as(mu)
        if debug:
            print("## STD ", std.shape)

        dist = Normal(mu, std)
        sample = dist.sample()
        if debug:
            print("################# DIST ", sample)
        return dist, value, sample

    def get_action(self, sample):
        """Convert a sampled tensor into a flat NumPy vector of softmax weights.

        The softmax runs over the last dimension.  The original used dim=1,
        which for the (1, 1, action_size) sample returned by forward() is a
        size-1 axis and therefore yields all ones; for 2-D input the two are
        identical.

        Args:
            sample: tensor whose last dimension indexes the actions.

        Returns:
            1-D NumPy array with all elements of the softmaxed tensor.
        """
        # detach() so a tensor that still tracks gradients can be exported.
        action = F.softmax(sample.detach(), dim=-1)
        return action.cpu().numpy().flatten()

################# LAYER 0
Parameter containing:
tensor([[4.5298, 3.2061, 4.3238, 3.8674, 3.9267, 3.8688, 3.2649, 4.1748, 1.7521,
2.5565, 4.4183, 5.3011, 2.0775, 4.7228]], device='cuda:0',
requires_grad=True)
################# LAYER 1
Parameter containing:
tensor([[[ 8.1197e-13, -4.2475e-13, 9.3603e-13],
[-1.5869e-13, -1.0279e-12, 9.4750e-13],
[ 1.0955e-12, -9.1328e-13, 8.6053e-13],
...,
[ 2.0737e-13, 5.3980e-13, -6.6840e-13],
[ 2.5901e-13, 1.1984e-12, -1.2454e-15],
[-8.9355e-13, -7.5358e-13, -1.0472e-12]],

    [[ 9.8930e-13,  1.6604e-12,  4.2620e-13],
     [ 3.9799e-13,  1.1449e-12, -2.3047e-13],
     [-1.2045e-12,  9.5506e-13, -9.2304e-13],
     ...,
     [ 8.5082e-13,  1.5487e-13, -1.1870e-12],
     [-1.0039e-12,  8.0307e-13, -1.4503e-13],
     [ 4.4152e-13,  8.1767e-13,  9.3908e-13]],

    [[-1.3156e-12, -1.3375e-13,  6.9104e-13],
     [-1.5915e-12,  9.0684e-13,  1.1879e-12],
     [-1.1602e-12, -8.8697e-13, -9.8110e-13],
     ...,
     [ 1.0038e-12,  1.0981e-12, -1.1023e-12],
     [-8.0543e-13,  1.2475e-12, -5.5562e-13],
     [ 8.1007e-13, -8.7900e-13, -3.8334e-13]],

    ...,

    [[-4.3846e-13,  1.7409e-12, -1.0247e-12],
     [-1.3925e-12,  2.3277e-13,  5.9578e-13],
     [ 3.2765e-13, -8.6728e-13, -1.2308e-12],
     ...,
     [-7.9005e-13, -3.9007e-13, -1.0014e-12],
     [-9.0714e-13,  1.1352e-12, -1.1337e-12],
     [ 5.3597e-13,  1.2971e-12, -1.0426e-12]],

    [[-4.4151e-13, -2.0322e-13,  4.7584e-13],
     [-5.6349e-13,  1.8669e-13, -1.4227e-13],
     [-1.2109e-12,  7.0184e-13,  5.2813e-13],
     ...,
     [-7.0628e-13, -1.8449e-12,  8.4682e-13],
     [-1.4865e-12, -5.8730e-13, -6.3716e-13],
     [-3.2126e-13,  9.3327e-13, -6.5488e-13]],

    [[-1.6806e-12, -6.0561e-13,  9.1428e-13],
     [-1.4489e-12, -1.0895e-12, -1.3648e-12],
     [-7.0027e-13,  6.7084e-13, -2.5034e-13],
     ...,
     [ 3.3368e-13, -1.1739e-12,  1.1714e-12],
     [-3.1215e-13, -1.0706e-12, -1.2255e-12],
     [ 8.9367e-13,  6.9952e-13, -9.6102e-13]]], device='cuda:0',
   requires_grad=True)

################# LAYER 2
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
device='cuda:0', requires_grad=True)
################# LAYER 3
Parameter containing:
tensor([ 4.7898e-12, -4.7675e-13, -1.6134e-12, -1.0257e-12, 1.5734e-12,
-1.9424e-12, -2.2574e-12, -4.8935e-12, -1.0122e-11, -5.7205e-12,
-6.0709e-12, 5.0607e-13, -4.6406e-12, -8.9328e-12, 2.4282e-14,
-1.0146e-11, 3.5188e-13, -1.1911e-11, 5.5329e-12, 3.9869e-13,
-2.7697e-12, -1.2245e-12, 1.6598e-12, 7.1321e-13, 4.4886e-12,
-4.6125e-12, 4.4556e-12, -9.5881e-12, 1.2254e-12, -4.8361e-12,
-6.2434e-13, 6.7550e-12, 1.3284e-12, -4.7411e-12, -2.0949e-12,
3.5917e-12, -1.3278e-12, 2.7495e-12, 3.4560e-13, 2.8214e-13,
-9.8566e-12, 1.0246e-13, 4.7840e-13, 1.7597e-12, 4.0107e-12,
-3.0660e-12, -1.0735e-12, 5.6410e-12, 1.0390e-12, -7.7697e-12,
-4.1233e-12, -9.4497e-13, 4.9999e-12, -1.9606e-14, 1.2165e-12,
-2.9938e-12, 1.4949e-12, 3.1029e-12, -7.6224e-12, -8.0108e-12,
-6.5905e-12, -9.9776e-15, 6.0203e-12, 7.3820e-12], device='cuda:0',
requires_grad=True)
################# LAYER 4
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
device='cuda:0', requires_grad=True)
################# LAYER 5
Parameter containing:
tensor([[[-9.1749e-13, 6.9991e-13, -4.4765e-13],
[-2.2080e-13, 8.4679e-13, -8.0278e-13],
[ 4.4775e-13, 3.7334e-13, -7.7496e-13],
...,
[-9.1806e-13, -3.2900e-13, 9.5355e-13],
[ 5.5482e-13, -1.1580e-12, -6.6726e-14],
[ 9.2452e-13, -3.0767e-13, -2.4828e-13]],

    [[-3.3471e-13, -1.0614e-12,  9.6939e-13],
     [ 9.5546e-13,  1.1446e-12,  7.0745e-13],
     [-7.6769e-13, -4.5985e-13,  8.6706e-13],
     ...,
     [ 1.2287e-13,  9.2356e-13, -3.3413e-13],
     [ 4.7191e-13,  2.9365e-13, -5.8289e-13],
     [-5.8498e-14,  1.1974e-13,  9.0870e-13]],

    [[-8.7646e-13, -8.4400e-13, -6.7215e-13],
     [ 1.6161e-13,  2.2019e-13,  5.6360e-13],
     [-3.2442e-13, -7.8114e-13, -1.5734e-13],
     ...,
     [-1.0273e-12, -1.2322e-12,  9.6540e-13],
     [-5.0431e-13, -4.8332e-13,  8.1761e-13],
     [-7.5004e-14,  3.1944e-13,  4.2709e-13]],

    ...,

    [[-4.3790e-13, -1.1087e-13,  7.3244e-13],
     [-4.6267e-13,  5.0530e-13, -1.1217e-12],
     [ 4.5589e-13, -9.8030e-13,  6.2932e-13],
     ...,
     [ 1.7241e-14,  6.7941e-13, -7.5378e-13],
     [ 5.3053e-13,  1.0892e-12,  8.8375e-13],
     [-7.9630e-13, -1.6237e-13,  4.7836e-13]],

    [[-3.3527e-13,  1.3322e-13, -1.2507e-12],
     [ 1.0321e-12, -5.4462e-13,  5.2212e-13],
     [ 7.7059e-13, -7.3867e-13, -9.9665e-13],
     ...,
     [-9.5555e-13,  5.2555e-13, -2.1556e-13],
     [ 6.2504e-13, -8.1202e-13, -4.3386e-13],
     [ 7.6186e-13, -4.3495e-14, -4.4856e-13]],

    [[ 3.2584e-13,  8.9820e-13, -4.9038e-15],
     [-7.9733e-13,  8.1976e-13, -1.2340e-12],
     [-9.1442e-13, -2.3529e-13, -8.5760e-13],
     ...,
     [ 8.6392e-13,  1.6917e-13,  3.8579e-13],
     [-8.8679e-13,  2.9397e-13, -9.3440e-14],
     [ 1.6388e-14, -1.1082e-12,  5.6219e-13]]], device='cuda:0',
   requires_grad=True)

################# LAYER 6
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0', requires_grad=True)
################# LAYER 7
Parameter containing:
tensor([-6.5277e-12, -5.0030e-12, -4.7853e-12, -7.9111e-12, 1.7474e-12,
-5.4624e-12, -1.5666e-12, 1.1323e-12, 2.3214e-12, 5.5100e-12,
-1.1172e-11, -4.7060e-13, 3.5249e-12, -1.4258e-12, -4.9131e-12,
3.6777e-13, -7.4106e-12, 1.1617e-12, 1.2135e-12, 5.4477e-12,
3.5162e-12, -2.7667e-12, 1.2136e-12, -3.2840e-13, -7.5159e-12,
-9.7880e-12, -3.8751e-12, 2.9533e-12, 1.1442e-11, -3.2621e-12,
3.4526e-12, 9.6877e-12, 2.1575e-12, 1.9184e-12, 6.0403e-12,
-1.8376e-12, 5.5212e-12, 1.8269e-12, 1.0489e-11, -1.9555e-12,
-6.2992e-12, -3.0261e-12, 6.5195e-12, 2.1130e-12, -1.9235e-12,
2.2757e-12, -1.7911e-12, -8.7390e-12, -8.7944e-12, 2.4806e-12,
6.5045e-12, 2.7929e-12, 4.7111e-12, 9.3810e-13, -3.5074e-12,
1.8981e-12, 1.0765e-11, 4.1251e-12, 1.1413e-12, -7.7359e-13,
-1.0038e-12, 2.2544e-12, 6.5235e-12, 1.0479e-12, -8.5266e-13,
-9.5834e-12, 9.7947e-12, 1.4685e-12, 8.3243e-13, -5.2059e-12,
-4.3357e-12, -2.3015e-12, -3.9781e-12, 2.5333e-13, 1.0566e-12,
4.5856e-13, 2.6240e-12, 4.7677e-12, 8.9711e-13, -7.5163e-13,
4.7525e-12, -2.0269e-12, 6.5637e-12, -5.5774e-12, 1.0567e-11,
7.2593e-12, -1.4730e-12, 3.7043e-13, -8.6711e-12, -8.5086e-12,
-2.1155e-12, 2.8729e-12, -6.0512e-12, -7.5220e-13, -5.6717e-12,
1.4139e-12, -5.5303e-12, 2.5870e-12, -2.8270e-12, -4.4242e-12,
7.4383e-12, 1.2639e-14, -4.1597e-13, 3.3919e-12, -7.5453e-12,
-2.8660e-12, 4.6736e-12, -9.4438e-12, 4.2355e-12, 3.6640e-12,
4.4290e-12, -7.0120e-12, 9.0853e-12, 1.0715e-12, -9.3707e-12,
6.2837e-12, -5.3541e-12, -2.8448e-12, -9.5911e-12, -1.7640e-12,
-5.4320e-12, -7.6288e-12, -4.3415e-12, 5.1918e-13, 4.7735e-12,
4.1149e-12, 3.7701e-12, 9.2635e-13], device='cuda:0',
requires_grad=True)
################# LAYER 8
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0', requires_grad=True)
################# LAYER 9
Parameter containing:
tensor([[[ 5.1566e-13, -4.7444e-13, -6.9026e-13],
[-3.4053e-13, 7.9233e-13, 1.6796e-13],
[-1.0529e-13, 6.8990e-13, 1.1773e-12],
...,
[-9.5066e-13, 7.0233e-13, 2.6340e-13],
[-6.5295e-13, -1.3386e-13, -7.0351e-13],
[-8.5952e-13, -5.0669e-13, 8.3362e-13]],

    [[ 4.6532e-13, -6.1337e-13, -9.9642e-13],
     [-1.3463e-13,  8.3161e-13,  9.1906e-13],
     [-1.2113e-12, -9.1594e-13,  4.1988e-13],
     ...,
     [ 4.9874e-13,  5.4375e-13, -3.3214e-13],
     [ 8.4086e-13,  2.4088e-13,  9.4330e-13],
     [-4.8954e-13, -6.6836e-13,  7.8116e-13]],

    [[ 5.9998e-13, -1.0423e-12,  7.3040e-13],
     [-7.0307e-13,  8.4661e-13, -7.3966e-13],
     [ 5.1666e-13,  5.6347e-14,  2.9479e-13],
     ...,
     [ 4.0156e-16, -1.1715e-12, -8.6710e-13],
     [-9.2976e-13,  8.8033e-13, -3.1712e-13],
     [ 8.4956e-13, -1.0592e-12,  8.0408e-14]],

    ...,

    [[ 5.9349e-13,  8.7567e-13, -5.2193e-13],
     [ 5.0274e-13,  5.5871e-13,  3.9201e-13],
     [-7.5430e-13,  4.1236e-13, -9.9120e-13],
     ...,
     [ 3.1279e-13, -9.1984e-13, -1.0392e-12],
     [ 1.2115e-12,  1.0384e-12,  3.2271e-13],
     [-4.8747e-13,  8.3519e-13,  4.9309e-14]],

    [[ 3.4600e-13, -4.1635e-13, -9.2472e-13],
     [ 9.0311e-13, -2.7453e-13,  1.0157e-12],
     [-1.1334e-13, -8.3661e-13,  3.3139e-13],
     ...,
     [ 3.7354e-13, -8.5102e-13,  8.7721e-13],
     [ 1.1851e-12,  3.9697e-13, -4.9677e-13],
     [ 8.2270e-13, -1.3526e-13, -6.3453e-13]],

    [[ 8.2896e-13,  5.1444e-13,  9.4765e-13],
     [-5.8019e-13,  1.2139e-12, -1.0154e-12],
     [-5.5610e-14, -9.2030e-13,  1.1360e-13],
     ...,
     [-7.4880e-13, -9.0875e-13, -4.8534e-13],
     [ 7.2151e-13,  4.7504e-13, -8.7364e-13],
     [-8.4486e-13, -4.3754e-14,  5.2481e-13]]], device='cuda:0',
   requires_grad=True)

################# LAYER 10
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
device='cuda:0', requires_grad=True)
################# LAYER 11
Parameter containing:
tensor([ 5.1689e-12, 2.7563e-12, -1.0986e-11, -4.9929e-12, -5.9312e-14,
9.1750e-13, -3.9952e-13, -6.7056e-12, 1.4211e-12, 4.1140e-13,
-7.7132e-13, -7.5757e-12, 1.1301e-11, 3.6119e-12, -9.3969e-12,
-4.5382e-12, 2.9386e-12, 2.2074e-14, -4.0622e-13, -9.7997e-13,
-3.3614e-12, 6.3079e-12, -7.6360e-12, 2.6342e-12, 1.0627e-11,
7.8578e-12, 9.9241e-12, 1.2658e-12, -5.1541e-12, 8.9738e-13,
8.7090e-13, 2.0575e-12, 2.2396e-12, 1.1155e-13, -3.3117e-12,
-2.7834e-12, 4.8563e-12, -2.0356e-12, -5.4097e-13, 8.4791e-12,
2.2872e-12, -2.9220e-12, 9.1569e-12, -9.3902e-12, -1.0786e-11,
8.0479e-13, 1.8477e-14, 3.0034e-13, 4.4619e-12, 2.8540e-12,
-2.3062e-12, -7.1166e-12, 3.1356e-12, 6.3689e-12, -6.1431e-13,
-1.8086e-12, 5.0431e-12, -2.2746e-12, 4.0181e-12, -5.2324e-12,
-1.1323e-11, 8.5463e-13, 8.6182e-12, -8.3033e-13, 9.6582e-12,
1.0094e-12, 1.7173e-12, 5.0689e-12, -8.6555e-12, 5.6419e-12,
-4.0032e-12, 1.1285e-12, 1.7110e-12, -5.9280e-12, -8.7870e-12,
4.1830e-12, 3.2554e-12, 2.1135e-12, -1.7908e-12, 3.4034e-12,
-1.6567e-12, -2.1318e-12, -1.2069e-12, 9.0663e-12, -1.2774e-12,
4.7610e-12, -1.9891e-12, -7.7957e-12, 8.4528e-12, 5.6940e-12,
-9.4517e-12, -5.2855e-12, -2.1793e-12, -7.0676e-12, 3.6789e-12,
-3.5174e-12, -4.7724e-12, 4.8315e-12, -9.1913e-13, -8.7613e-13,
6.2217e-13, -4.5828e-12, -6.1973e-12, 1.8257e-12, -2.8081e-12,
-2.5123e-12, -5.9462e-12, 8.9773e-12, 6.4044e-12, -2.4790e-12,
5.6777e-12, -3.6182e-13, -2.0754e-13, 7.2739e-12, -4.3675e-12,
-1.0837e-11, -1.4829e-12, 2.4258e-12, -8.1577e-16, 3.0472e-12,
3.9120e-12, 4.0820e-12, 8.9220e-13, 2.3265e-12, -4.1897e-12,
3.6498e-12, -6.0129e-12, -1.3162e-12, -4.5501e-12, 1.6370e-12,
4.1894e-13, 2.3374e-12, 1.6148e-12, 3.1469e-12, -2.5502e-12,
-5.0040e-12, 2.6144e-12, -7.7184e-13, 3.5351e-12, 8.2813e-12,
6.9857e-13, -1.8105e-12, -7.0587e-13, -1.7778e-12, 5.8004e-12,
8.6235e-12, -4.7011e-12, 1.3301e-12, -5.6521e-13, 4.0904e-12,
-9.8805e-12, -1.0641e-12, -4.7594e-12, 1.5009e-12, 1.0451e-11,
9.9727e-13, -8.5178e-13, 7.8408e-12, -7.7304e-12, 5.4389e-12,
-7.5599e-13, 1.7698e-12, -1.2480e-12, -1.0697e-12, -5.3639e-12,
4.2928e-12, 4.6260e-12, -3.6102e-12, 7.4131e-12, 4.5939e-12,
-2.0450e-12, -3.6531e-13, 5.5571e-12, -2.2810e-13, 4.6733e-12,
-9.1133e-13, -4.2755e-12, 1.7801e-12, 2.5334e-13, 7.6334e-12,
6.6379e-13, 3.3144e-12, 2.6772e-12, -1.5503e-12, -4.7279e-14,
1.1554e-12, -2.6879e-12, -1.2102e-12, -6.3262e-12, 1.6018e-12,
3.9160e-12, -3.8185e-12, 4.9529e-12, -5.0967e-13, 4.1543e-12,
3.0592e-12, -1.0954e-11, -2.0437e-12, -3.3053e-12, 2.1905e-12,
-6.3636e-13, -8.7177e-12, -3.3186e-12, 4.4105e-12, 7.6706e-12,
8.4479e-12, -7.7513e-13, -1.8882e-12, 1.0554e-12, 2.2405e-12,
5.8512e-12, -1.1127e-12, -3.4239e-12, 8.0291e-13, -3.6705e-12,
-7.5360e-13, 7.2129e-13, 2.7786e-12, 9.8050e-12, 8.6499e-13,
1.0342e-12, -2.9787e-12, -4.8941e-13, 5.8866e-13, 2.1878e-12,
1.6786e-12, 3.7063e-12, -7.1629e-12, -2.3309e-12, -8.6171e-12,
5.2345e-13, 4.8642e-13, -2.0965e-12, 6.0043e-12, -4.6277e-12,
9.6424e-12, -8.9239e-13, 8.1678e-13, 1.9151e-12, -5.6445e-12,
-8.3018e-12, -4.8596e-12, -1.0995e-11, 7.0360e-12, -7.1324e-12,
2.7213e-12, -2.3374e-12, 8.8526e-12, -3.0006e-12, 8.7732e-12,
3.6859e-12, -2.6599e-13, 1.0468e-12, 4.6716e-12, 4.4367e-12,
6.7195e-12], device='cuda:0', requires_grad=True)
################# LAYER 12
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
device='cuda:0', requires_grad=True)
################# LAYER 13
Parameter containing:
tensor([[[ 5.0845e-13, -3.5231e-13, -1.1128e-12],
[-1.1493e-13, -8.1201e-13, -6.4135e-13],
[-5.5845e-13, 1.2017e-12, 1.1682e-12],
...,
[ 7.7372e-13, -5.8116e-13, -1.0106e-12],
[-9.2365e-13, -1.1941e-12, 6.0454e-13],
[-6.3145e-13, -9.5146e-14, 3.9734e-13]],

    [[ 2.8268e-13, -3.0826e-13,  9.3267e-13],
     [ 2.3651e-13,  2.9566e-13,  1.1273e-12],
     [-6.9206e-13, -3.8371e-14, -7.9597e-13],
     ...,
     [-9.0061e-13, -2.6643e-13,  8.3692e-13],
     [ 2.5696e-13, -1.0952e-12, -6.6256e-13],
     [ 9.8497e-13,  5.8217e-13,  9.7263e-13]],

    [[-1.0641e-12,  7.5107e-13,  7.8477e-13],
     [-8.9476e-13, -1.0441e-14,  8.3987e-13],
     [ 1.3852e-14, -8.2954e-13, -2.8288e-13],
     ...,
     [-4.8893e-13,  1.1174e-12, -8.6219e-13],
     [-8.1246e-13,  2.0990e-13, -9.4476e-13],
     [-4.6247e-13,  2.0494e-13,  1.4481e-13]],

    ...,

    [[-1.1332e-13, -1.1840e-12, -9.7236e-13],
     [ 6.4163e-13,  1.0622e-12,  6.8038e-13],
     [-8.0847e-13,  1.5823e-13, -8.4432e-13],
     ...,
     [-1.1394e-12, -2.7535e-13,  8.2736e-13],
     [-1.2143e-12,  5.6658e-13, -8.0282e-13],
     [-8.5205e-13, -7.8970e-13, -3.0932e-14]],

    [[ 1.0462e-13, -2.5782e-14, -1.1952e-13],
     [ 1.0311e-12, -8.5143e-13, -9.3092e-13],
     [ 1.1125e-12,  1.0688e-12, -1.0421e-13],
     ...,
     [ 1.4973e-13, -9.4634e-13,  3.2734e-13],
     [-5.2586e-13, -6.3077e-13,  9.9838e-13],
     [-8.1737e-13,  2.2873e-13, -8.0070e-13]],

    [[-5.6898e-13, -5.1653e-13,  1.1437e-12],
     [ 1.2054e-12,  7.7903e-13, -8.6264e-13],
     [-6.6789e-13, -9.6224e-13,  2.9781e-13],
     ...,
     [-4.8921e-13,  5.6288e-14, -7.4274e-14],
     [ 8.9793e-13,  4.2910e-13,  8.0503e-13],
     [ 6.6987e-13,  7.3716e-13, -1.8283e-13]]], device='cuda:0',
   requires_grad=True)

################# LAYER 14
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0', requires_grad=True)

Would be hard to say without seeing the loss function.

I figured it out… Thanks for responding to my post

What was the issue that you figured out?