Gradients saturating to zero without learning

I have a ResNet-18 that takes in an image and a CNN vocal encoder that takes in a NumPy vector. Their outputs are concatenated and stacked into a sequence. This sequence is passed to an LSTM, which produces the output, and the loss is then back-propagated through the entire network above.
The issue is that the gradients quickly decrease to very low values like 10e-10 or 10e-11 after only around 50-100 sequences (each sequence consists of 150 outputs from the ResNet and the vocal encoder). There is also no learning at all: the training accuracy is equivalent to picking a random number, and the loss never goes down. If I switch off either the ResNet or the vocal encoder and train the remaining one with the LSTM, the same thing happens.
After looking into the data and the inputs, I am pretty sure there is a bug in my code and some component is not being back-propagated through, but I am not able to figure out which one. Help is appreciated!

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torchvision.models as models
from matplotlib import pyplot as plt
import numpy as np
import h5py
from PIL import Image
from sklearn.externals import joblib
import shutil
import os
import random
import pickle
import time
import gc
'-----------------------------------------------------Vocal Net--------------------------------------------------------'

class VocalNet(nn.Module):
    """Two-stage 1-D convolutional encoder for single-channel vocal input.

    Each stage applies a convolution, then max-pooling, then a leaky ReLU.
    The final activation is reshaped so that its first dimension equals the
    module-level ``vocalnet_output_size``; that global must be defined
    before ``forward`` is called.
    """

    def __init__(self):
        super(VocalNet, self).__init__()
        # Both convolutions share the same kernel width, stride and padding.
        conv_geometry = dict(kernel_size=40, stride=1, padding=20)
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=20, **conv_geometry)
        self.conv2 = nn.Conv1d(in_channels=20, out_channels=40, **conv_geometry)

    def forward(self, vocal_input):
        """Encode ``vocal_input`` and return it viewed as (vocalnet_output_size, -1).

        NOTE(review): the feature size ends up on axis 0, not the sample axis;
        callers are expected to re-view the result -- confirm this is intended.
        """
        stage_one = F.max_pool1d(self.conv1(vocal_input), 2)
        stage_one = F.leaky_relu(stage_one)
        stage_two = F.max_pool1d(self.conv2(stage_one), 5)
        stage_two = F.leaky_relu(stage_two)
        return stage_two.view(vocalnet_output_size, -1)

'-----------------------------------------------------LSTM Decoder-----------------------------------------------------'

class DecoderLSTM(nn.Module):
    """LSTM followed by a linear read-out of the last time step."""

    def __init__(self, input_size, hidden_size, num_layers, no_of_emotions):
        """Create the recurrent layer and the output projection.

        ``no_of_emotions`` is accepted but not used here; the projection
        width is taken from the module-level ``seq_len``, which must be
        defined before an instance is created.
        """
        super(DecoderLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(in_features=hidden_size, out_features=seq_len)

    def forward(self, features):
        """Run ``features`` through the LSTM and project its final step.

        Only the last entry along the first axis of the LSTM output is fed
        to the linear layer.
        """
        all_steps, _final_state = self.lstm(features)
        last_step = all_steps[-1]
        return self.linear(last_step)

# --- Which encoders feed the LSTM ---
using_vision_network = True
using_vocal_network = True

# --- Batching and schedule ---
batch_size = 2
mega_batch_size = 2
no_of_epochs = 1000

# --- Model dimensions ---
hidden_size = 256
num_layers = 2
no_of_emotions = 6
seq_len = 150

# --- Run-mode switches ---
use_CUDA = True
use_pretrained = False
test_mode = False
show_image = True

# --- Vision trunk: an untrained ResNet-18 on the GPU, minus its last child (the fc layer) ---
resnet = models.resnet18(pretrained=False).cuda()
modules = list(resnet.children())
modules.pop()  # drop the final fully connected layer
resnet = nn.Sequential(*modules)

'-----------------------------------Parameters NOT subject to change---------------------------------------------------'

len_waveform = 320  # This is the length of a 1 frame long waveform vector
vocalnet_output_size = 1280  # VocalNet outputs a 1280X1 feature vector
resnet18_output_size = 512  # Resnet Outputs a 1X512X1X1 feature vector.

# Width of one LSTM input step depends on which encoders are enabled.
# The final branch must be an explicit `else`: without it the vision-only
# case was overwritten with the vocal size and the vocal-only case left
# LSTM_input_size undefined.
if using_vision_network and using_vocal_network:
    LSTM_input_size = vocalnet_output_size + resnet18_output_size
elif using_vision_network:
    LSTM_input_size = resnet18_output_size
else:
    LSTM_input_size = vocalnet_output_size

# Every input tensor in the training loop, the ResNet trunk and the criterion
# are moved to CUDA, so the two remaining modules must live there as well;
# otherwise the forward pass mixes CPU weights with GPU inputs.
Vocal_encoder = VocalNet().cuda()  # Define the vocalnet model
lstm_decoder = DecoderLSTM(input_size=LSTM_input_size, hidden_size=hidden_size, num_layers=num_layers,
                           no_of_emotions=no_of_emotions).cuda()  # Define the shared LSTM Decoder.

# Bookkeeping counters used by the training loop.
curr_epoch = 0
total = 0
correct_5 = 0
correct_2 = 0

criterion = nn.MSELoss().cuda()
# A single optimizer updates the decoder and both encoders.
params = list(lstm_decoder.parameters()) + list(resnet.parameters()) + list(Vocal_encoder.parameters())
optimizer = torch.optim.Adam(params, lr=0.01)
'------------------------------------------Saving Intermediate Models--------------------------------------------------'

def save_checkpoint(state, is_final, filename='resnet18_vocalnet_MOSI_withLSTM_sample.pth.tar'):
    """Write ``state`` to ``filename`` and optionally keep a final copy.

    Args:
        state: object to serialize with ``torch.save``.
        is_final: when true, the saved file is also copied to
            'model_final.pth.tar' in the current working directory.
        filename: destination path of the checkpoint.
    """
    # The pasted source lost the start of this call, leaving only the dangling
    # ", filename)" on the def line; restored as torch.save(state, filename).
    torch.save(state, filename)
    if is_final:
        shutil.copyfile(filename, 'model_final.pth.tar')

'-------------------------------------------Setting into train mode----------------------------------------------------'

# Placeholders; both are overwritten inside the training loop.
combined_seq_total = ""
target_seq_total = ""
directory = "./all_mosi_sequences/train"
prev_loss = 0

# Index every usable file in `directory` under a running integer key.
sequences = {}
i = 0
forbidden = ["seq_c5xsKMxpXnc"]
for files in os.listdir(directory):
    # The first 15 characters of the file name are checked against the skip list.
    if files[0:15] not in forbidden:
        # The training loop reads sequences[key] as a file name inside
        # `directory`, so the name has to be recorded here; without this
        # assignment the dict stays empty and no batch is ever built.
        sequences[i] = files
        i += 1
# you can't shuffle a dictionary, but what you can do is shuffle a list of its keys
keys = list(sequences.keys())

if use_pretrained:
    # NOTE(review): the checkpoint is loaded but nothing visible here applies
    # it to a model or the optimizer -- confirm where it is consumed.
    checkpoint = torch.load('resnet18_vocalnet_MOSI_withLSTM.pth.tar')
    use_pretrained = False

# Main training loop: one pass over all indexed sequences per epoch.
for epoch in range(curr_epoch, no_of_epochs):
    # Accuracy counters and the running loss are reset at the start of every epoch.
    correct_5 = 0
    correct_2 =0    
    total_loss = 0
    # (key, file name) pairs in the order given by `keys`.
    input_list = [(key, sequences[key]) for key in keys]

    # Walk the list in steps of batch_size; j is the index of the first sample of the batch.
    for j in range(0, len(input_list), batch_size):
        # The running loss restarts whenever j is a multiple of mega_batch_size.
        if j%mega_batch_size==0:
            # print("GRADIENT ZEROED")
            total_loss = 0

        # input_batch is only reassigned while more than batch_size sequences remain;
        # otherwise the value from the previous iteration is reused.
        # NOTE(review): on the very first iteration with <= batch_size sequences,
        # input_batch would be undefined -- confirm whether this is intended.
        if ((len(sequences) - j) > batch_size):
            input_batch = input_list[j:j+batch_size]

        # Build the inputs for each sample of the batch, one sequence file at a time.
        for batch in range(batch_size):
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # only safe if the sequence files are trusted.
            with open(directory+"/"+str(input_batch[batch][1]), 'rb') as f:
                data = pickle.load(f)
            # data[0] -> targets, data[1] -> vocal frames, data[2] -> per-frame vision entries
            # (later indexed per frame and used as a file name).
            target_seq = np.array(data[0], dtype = np.float32)
            vocal_seq = np.array(data[1], dtype = np.float32)
            vision_seq = data[2]
            # Uninitialized buffers, filled frame by frame in the loop that follows.
            vision_seq_i3 = np.empty((seq_len,3,224,224), dtype=np.float32)
            vocal_seq_i1 = np.empty((seq_len, 1,320), dtype = np.float32)
            # Per-channel statistics used to standardize the image; hoisted
            # because they do not change between frames.
            mean = np.array([0.485, 0.456, 0.406])
            std = np.array([0.229, 0.224, 0.225])
            for seq in range(seq_len):
                file_name = vision_seq[seq]
                # The pasted source lost the call here; PIL's Image is the only
                # image loader imported by the file, so Image.open is restored.
                # The path drops the first 7 characters of the stored name and
                # is resolved relative to the parent directory.
                img = Image.open(".."+file_name[7:])
                pixels = np.array(img, dtype=np.uint8)/255.0
                if show_image:
                    # Preview the first frame once, before standardization, so
                    # the displayed values stay within [0, 1].
                    plt.imshow(pixels, interpolation='nearest')
                    show_image = False
                # Standardize: subtract the mean, then divide by the std. The
                # original computed `std * pixels + mean`, which is the inverse
                # transform and was the cause of the stalled training.
                pixels = (pixels - mean) / std
                # Reorder from (H, W, C) to (C, H, W) to match the buffer layout.
                pixels = pixels.transpose(2, 0, 1)
                vision_seq_i3[seq, :, :, :] = pixels
                vocal_seq_i1[seq, :, :] = vocal_seq[seq]

            # Move the image frames to the GPU and run them through the ResNet trunk;
            # the result is viewed as (seq_len, 1, resnet18_output_size).
            vision_seq_i4 = Variable(torch.FloatTensor(vision_seq_i3).cuda()).cuda()

            vision_seq_o =resnet(vision_seq_i4).view(seq_len, 1, resnet18_output_size)

            # Targets for this sample, viewed as a single row of seq_len values.
            target_seq_o = Variable(torch.from_numpy(target_seq)).view(1, seq_len).cuda()

            # Encode the vocal frames and view them as (seq_len, 1, vocalnet_output_size).
            vocal_seq_i = Variable(torch.from_numpy(vocal_seq_i1)).cuda()
            vocal_seq_o = Vocal_encoder(vocal_seq_i)
            vocal_seq_o = vocal_seq_o.view(seq_len, 1, vocalnet_output_size)

            # Choose the per-step LSTM input according to the enabled encoders.
            if using_vision_network and using_vocal_network:              
                # NOTE(review): the start of this expression is missing from the paste --
                # presumably torch.cat((vocal_seq_o, vision_seq_o), 2); confirm.
                combined_seq_i =, vision_seq_o), 2).cuda()
            elif using_vision_network and not using_vocal_network:
                combined_seq_i = vision_seq_o
                # NOTE(review): an `else:` line appears to be missing before the next
                # assignment; as written it overrides the vision-only choice -- confirm.
                combined_seq_i = vocal_seq_o
            # The first sample starts the batch tensors; later samples are appended.
            if batch == 0:
                combined_seq_total = combined_seq_i
                target_seq_total = target_seq_o
                # NOTE(review): an `else:` and the left part of both expressions are
                # missing -- presumably torch.cat((combined_seq_total, combined_seq_i), 1)
                # and torch.cat((target_seq_total, target_seq_o), 0); confirm.
                combined_seq_total =, combined_seq_i), 1)
                target_seq_total =, target_seq_o), 0)
            # print("DONE" + str(batch))
        # print(target_seq_total.size())
        # print(combined_seq_total.size())

        # Forward pass through the LSTM decoder and loss against the stacked targets.
        lstm_output = lstm_decoder(combined_seq_total)
        loss = criterion(lstm_output, target_seq_total)
        # print(lstm_output)
        # print(target_seq_total)

        # NOTE(review): no backward pass or optimizer step appears in the visible
        # lines; presumably lost along with the other missing text -- confirm.
        # print(list(Vocal_encoder.parameters())[-1].grad)
        # print(list(lstm_decoder.parameters()))

        # NOTE(review): both right-hand sides are missing from the paste -- presumably
        # NumPy copies of lstm_output and target_seq_total; confirm.
        predicted =
        actual =
        # Bucket the values with floor((v + 3) * k / 6): 5 bins (0..4) and 2 bins (0..1),
        # assuming predictions and targets lie in [-3, 3].
        predicted_5 = np.floor((predicted+3)*4.9999/6.0)
        actual_5 = np.floor((actual+3)*4.9999/6.0)
        predicted_2 = np.floor((predicted+3)*1.9999/6.0)
        actual_2 = np.floor((actual+3)*1.9999/6.0)
        # Count matching buckets for both granularities.
        correct_5 += (predicted_5 == actual_5).sum()
        correct_2 += (predicted_2 == actual_2).sum()
        # NOTE(review): the operand before `[0]` is missing -- presumably the scalar
        # loss value (e.g. loss.data[0]); confirm.
        total_loss +=[0]
        # Report running loss and accuracies; the hard-coded 2 matches batch_size as configured.
        if (j+2)%mega_batch_size==0:
            total_frames = (j+2)*seq_len
            print('Training -- Epoch [%d], Sample [%d], Average Loss: %.4f, Accuracy: %.4f, Accuracy-2: %.4f'
                % (epoch+1, j+2, 2*total_loss/mega_batch_size, (100.0*correct_5)/total_frames,(100.0*correct_2)/total_frames))
        # NOTE(review): the body of this `if` is cut off in the paste; its contents are unknown.
        if (j+2)%mega_batch_size==0:
1 Like

It definitely sounds like your gradients aren’t right. I haven’t looked through your code, but in general, my advice would be to break it up into smaller functions that you can test individually. Good luck!

1 Like

Hi Richard. Yes I did that and the update is that there is definitely a problem in the LSTM.
As an experiment, I froze only the LSTM's weights and let the gradients from the LSTM's output flow back into the ResNet and the VocalNet, and everything was normal.
So the ResNet was even able to adapt to the LSTM's initialization and modify its filters according to the output.
The gradients also didn’t decrease to the above values and my training accuracy increased every epoch . Now I’m not able to figure out what could possibly be wrong with my LSTM. It seems pretty simple.

@apaszke @smth Could you please help ? I have been trying a LOT. Any directions would be great

@codeislife99 I am also facing a similar issue. Did your issue get resolved?

Yes, the issue was with the normalization of the input data. I switched from using my own normalization function to torch’s default image normalization, and everything was fine.