I have a ResNet-18 that takes in an image and a CNN vocal encoder that takes in a NumPy vector. Their outputs are concatenated and stacked into a sequence. This sequence is passed to an LSTM, which produces the output, and the loss is then back-propagated through the entire network described above.
The issue is that the gradients quickly decrease to very low values, like 10e-10 or 10e-11, after only around 50-100 sequences (each sequence consists of 150 outputs from the ResNet and the vocal encoder). There is also no learning at all: the training accuracy is equivalent to picking a random number, and the loss never goes down. If I shut off either the ResNet or the vocal encoder and train the remaining one with the LSTM, the result is the same.
After looking into the data and the inputs, I am pretty sure there is a bug in my code and that some component is not being back-propagated through, but I am not able to figure out which one. Any help is appreciated!
"""----------------------------------------------------Imports-------------------------------------------------------"""
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torchvision.models as models
from matplotlib import pyplot as plt
import numpy as np
import h5py
from PIL import Image
from sklearn.externals import joblib
import shutil
import os
import random
import pickle
import time
import gc
'-----------------------------------------------------Vocal Net--------------------------------------------------------'
class VocalNet(nn.Module):
    """Two-layer 1-D convolutional encoder for raw waveform frames.

    Each convolution is followed by max-pooling and a leaky ReLU. The result
    is flattened to one feature vector per input sample.
    """

    def __init__(self):
        super(VocalNet, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=20, kernel_size=40, stride=1, padding=20)
        self.conv2 = nn.Conv1d(in_channels=20, out_channels=40, kernel_size=40, stride=1, padding=20)

    def forward(self, vocal_input):
        """Encode a batch of single-channel waveforms.

        Args:
            vocal_input: tensor of shape ``(N, 1, length)``.

        Returns:
            Tensor of shape ``(N, features)``. For ``length == 320`` the
            two conv/pool stages leave 40 channels of 32 steps, i.e. 1280
            features per sample.
        """
        x = F.leaky_relu(F.max_pool1d(self.conv1(vocal_input), 2))
        x = F.leaky_relu(F.max_pool1d(self.conv2(x), 5))
        # Flatten per sample. The previous ``view(vocalnet_output_size, -1)``
        # relied on a module-level global and produced a (features, N) tensor
        # whose rows mixed values from different samples; callers that
        # re-view the result to (N, 1, features) see the same memory order.
        return x.view(x.size(0), -1)
'-----------------------------------------------------LSTM Decoder-----------------------------------------------------'
class DecoderLSTM(nn.Module):
    """LSTM over a feature sequence followed by a linear read-out.

    Only the top-layer output of the final time step is fed to the linear
    layer, so one vector of ``output_size`` values is produced per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, no_of_emotions, output_size=None):
        """Set the hyper-parameters and build the layers.

        Args:
            input_size: width of each time step's feature vector.
            hidden_size: LSTM hidden state width.
            num_layers: number of stacked LSTM layers.
            no_of_emotions: accepted for interface compatibility; it is not
                used when building the layers.
            output_size: width of the linear read-out. When ``None`` the
                module-level ``seq_len`` is used, which is the original
                behaviour.
        """
        super(DecoderLSTM, self).__init__()
        if output_size is None:
            # Backward-compatible default: one output per frame of a sequence.
            output_size = seq_len
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, features):
        """Decode a feature sequence into one output vector per sequence.

        Args:
            features: tensor of shape ``(steps, batch, input_size)``
                (``nn.LSTM`` is built with its default ``batch_first=False``).

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        hiddens, _ = self.lstm(features)
        # hiddens[-1] is the top-layer output at the last time step.
        outputs = self.linear(hiddens[-1])
        return outputs
'------------------------------------------------------Hyperparameters-------------------------------------------------'
# Encoders whose features are fed to the LSTM.
using_vision_network = using_vocal_network = True

# batch_size: sequences stacked into one LSTM forward pass.
# mega_batch_size: sequence count after which gradients are applied and reset.
batch_size = mega_batch_size = 2

# Decoder dimensions.
hidden_size = 256
num_layers = 2
no_of_emotions = 6
seq_len = 150  # frames per sequence

# Run control.
use_CUDA = True
no_of_epochs = 1000
use_pretrained = test_mode = False
show_image = True  # display the first loaded frame once, then switch itself off
'----------------------------------------------------------------------------------------------------------------------'
# Randomly initialised ResNet-18 on the GPU, with its last child module (the
# final fully connected layer) removed so it can be used as a feature extractor.
_backbone = models.resnet18(pretrained=False).cuda()
modules = list(_backbone.children())
modules = modules[:-1]
resnet = nn.Sequential(*modules)
'-----------------------------------Parameters NOT subject to change---------------------------------------------------'
# Fixed sizes dictated by the data and the two encoders.
len_waveform = 320           # samples in a one-frame waveform vector
vocalnet_output_size = 1280  # features VocalNet emits per frame
resnet18_output_size = 512   # features the truncated ResNet emits per frame

# The LSTM input width is the combined width of the enabled encoders; when the
# vision branch is off, the vocal width is used regardless of the vocal flag.
if not using_vision_network:
    LSTM_input_size = vocalnet_output_size
elif using_vocal_network:
    LSTM_input_size = vocalnet_output_size + resnet18_output_size
else:
    LSTM_input_size = resnet18_output_size

# Waveform encoder and the shared LSTM decoder.
Vocal_encoder = VocalNet()
lstm_decoder = DecoderLSTM(
    input_size=LSTM_input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    no_of_emotions=no_of_emotions,
)

# Bookkeeping counters.
curr_epoch = 0
total = 0
correct_5 = 0
correct_2 = 0
'----------------------------------------------------------------------------------------------------------------------'
# Mean-squared-error regression loss, placed on the GPU.
criterion = nn.MSELoss().cuda()

# A single Adam optimizer updates the decoder, the ResNet and the vocal
# encoder together, in that order.
params = []
for _module in (lstm_decoder, resnet, Vocal_encoder):
    params.extend(_module.parameters())

# NOTE(review): lr=0.01 is ten times Adam's documented default of 1e-3; worth
# re-checking if gradients collapse early in training.
optimizer = torch.optim.Adam(params, lr=0.01)
'------------------------------------------Saving Intermediate Models--------------------------------------------------'
def save_checkpoint(state, is_final, filename='resnet18_vocalnet_MOSI_withLSTM_sample.pth.tar',
                    final_filename='model_final.pth.tar'):
    """Serialize a training state and optionally publish it as the final model.

    Args:
        state: object passed to ``torch.save``.
        is_final: when true, the freshly written checkpoint is also copied to
            ``final_filename``.
        filename: path the checkpoint is written to.
        final_filename: destination of the copy made when ``is_final`` is
            true. Defaults to the previously hard-coded
            ``'model_final.pth.tar'``.
    """
    torch.save(state, filename)
    if is_final:
        shutil.copyfile(filename, final_filename)
'-------------------------------------------Setting into train mode----------------------------------------------------'
# Clear gradients, move to the GPU and switch to training mode for the two
# models created on the CPU; the ResNet only needs the mode switch here.
for _net in (lstm_decoder, Vocal_encoder):
    _net.zero_grad()
    _net.cuda()
    _net.train()
resnet.train()
'----------------------------------------------------------------------------------------------------------------------'
# Placeholders that the training loop rebinds once it assembles a batch.
combined_seq_total = ""
target_seq_total = ""

directory = "./all_mosi_sequences/train"
prev_loss = 0
forbidden = ["seq_c5xsKMxpXnc"]

# Number every usable sequence file, skipping any whose first 15 characters
# match a forbidden prefix.
sequences = {}
i = 0
for files in os.listdir(directory):
    if files[0:15] in forbidden:
        continue
    sequences[i] = files
    i += 1

# A dictionary cannot be shuffled, so the list of its keys is shuffled instead.
keys = list(sequences)

# Optionally resume all three models and the optimizer from a saved checkpoint.
if use_pretrained:
    checkpoint = torch.load('resnet18_vocalnet_MOSI_withLSTM.pth.tar')
    lstm_decoder.load_state_dict(checkpoint['lstm_decoder'])
    Vocal_encoder.load_state_dict(checkpoint['Vocal_encoder'])
    resnet.train()
    resnet.load_state_dict(checkpoint['resnet18'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    use_pretrained = False

random.shuffle(keys)
# Training loop.
#
# For every batch of `batch_size` sequence files: load the pickled sequence,
# encode its frames with the ResNet and its waveforms with the vocal encoder,
# stack the per-sequence features along the LSTM batch axis, regress the
# targets with the LSTM decoder and back-propagate an MSE loss. Gradients are
# zeroed at the start of, and applied at the end of, each group of
# `mega_batch_size` sequences.

# ImageNet channel statistics; constant, so built once instead of per frame.
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])

for epoch in range(curr_epoch, no_of_epochs):
    correct_5 = 0
    correct_2 = 0
    total_loss = 0
    if test_mode:
        break
    lstm_decoder.zero_grad()
    Vocal_encoder.zero_grad()
    resnet.zero_grad()
    input_list = [(key, sequences[key]) for key in keys]
    for j in range(0, len(input_list), batch_size):
        if j % mega_batch_size == 0:
            # Start of a new accumulation window.
            total_loss = 0
            optimizer.zero_grad()
            lstm_decoder.zero_grad()
            Vocal_encoder.zero_grad()
            resnet.zero_grad()

        # Stop only when fewer than `batch_size` sequences remain. The
        # previous strict `>` comparison also discarded the last full batch.
        if len(input_list) - j < batch_size:
            break
        input_batch = input_list[j:j + batch_size]

        for batch, (_, seq_file) in enumerate(input_batch):
            with open(directory + "/" + str(seq_file), 'rb') as f:
                # NOTE: pickle.load can execute arbitrary code; only load
                # sequence files from a trusted source.
                data = pickle.load(f)
            target_seq = np.array(data[0], dtype=np.float32)
            vocal_seq = np.array(data[1], dtype=np.float32)
            vision_seq = data[2]
            vision_seq_i3 = np.empty((seq_len, 3, 224, 224), dtype=np.float32)
            vocal_seq_i1 = np.empty((seq_len, 1, 320), dtype=np.float32)

            for seq in range(seq_len):
                file_name = vision_seq[seq]
                # Close the image file as soon as its pixels are in memory.
                with Image.open(".." + file_name[7:]) as img:
                    pixels = np.array(img, dtype=np.uint8) / 255.0
                if show_image:
                    # Show the raw [0, 1] image once, before standardisation.
                    plt.imshow(pixels, interpolation='nearest')
                    plt.show()
                    show_image = False
                # Standardise each channel. The previous `std * pixels + mean`
                # was the inverse (de-normalising) transform and squeezed the
                # inputs into a narrow, offset range.
                pixels = (pixels - mean) / std
                vision_seq_i3[seq, :, :, :] = pixels.transpose(2, 0, 1)
                vocal_seq_i1[seq, :, :] = vocal_seq[seq]

            vision_seq_i4 = Variable(torch.FloatTensor(vision_seq_i3).cuda())
            vision_seq_o = resnet(vision_seq_i4).view(seq_len, 1, resnet18_output_size)
            target_seq_o = Variable(torch.from_numpy(target_seq)).view(1, seq_len).cuda()
            vocal_seq_i = Variable(torch.from_numpy(vocal_seq_i1)).cuda()
            vocal_seq_o = Vocal_encoder(vocal_seq_i)
            vocal_seq_o = vocal_seq_o.view(seq_len, 1, vocalnet_output_size)

            if using_vision_network and using_vocal_network:
                combined_seq_i = torch.cat((vocal_seq_o, vision_seq_o), 2)
            elif using_vision_network and not using_vocal_network:
                combined_seq_i = vision_seq_o
            else:
                combined_seq_i = vocal_seq_o

            # Features are stacked along dim 1 (LSTM batch axis), targets along dim 0.
            if batch == 0:
                combined_seq_total = combined_seq_i
                target_seq_total = target_seq_o
            else:
                combined_seq_total = torch.cat((combined_seq_total, combined_seq_i), 1)
                target_seq_total = torch.cat((target_seq_total, target_seq_o), 0)

        lstm_output = lstm_decoder(combined_seq_total)
        loss = criterion(lstm_output, target_seq_total)
        loss.backward()

        # Debug output: gradient and value of the first element of the
        # ResNet's last parameter.
        print(list(resnet.train().parameters())[-1].grad[0])
        print(list(resnet.parameters())[-1][0])

        # Bucket predictions and targets (shifted by +3 and scaled from a
        # width-6 range) into 5 and 2 bins and count exact bin matches.
        predicted = lstm_output.data.cpu().numpy()
        actual = target_seq_total.data.cpu().numpy()
        predicted_5 = np.floor((predicted + 3) * 4.9999 / 6.0)
        actual_5 = np.floor((actual + 3) * 4.9999 / 6.0)
        predicted_2 = np.floor((predicted + 3) * 1.9999 / 6.0)
        actual_2 = np.floor((actual + 3) * 1.9999 / 6.0)
        correct_5 += (predicted_5 == actual_5).sum()
        correct_2 += (predicted_2 == actual_2).sum()
        # NOTE(review): `loss.data[0]` is the scalar-extraction idiom of the
        # Variable-era PyTorch this script targets - confirm before upgrading.
        total_loss += loss.data[0]

        # Sequences consumed so far this epoch; previously the literal 2 was
        # used in place of batch_size.
        seen = j + batch_size
        if seen % mega_batch_size == 0:
            total_frames = seen * seq_len
            print('Training -- Epoch [%d], Sample [%d], Average Loss: %.4f, Accuracy: %.4f, Accuracy-2: %.4f'
                  % (epoch + 1, seen, batch_size * total_loss / mega_batch_size,
                     (100.0 * correct_5) / total_frames, (100.0 * correct_2) / total_frames))
            optimizer.step()