I tried to implement a model that learns to localize actions in a video.
The model consists of two parts: a feature-extraction part based on a CNN and a sequence-recognition part based on an RNN, and it uses CTC loss to predict the class at each step of the sequence, as in the code below.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

class ResCNNEncoder(nn.Module):
    def __init__(self, CNN_embed_dim=128, fc_hidden=(128, 128), drop_p=0.3):
        super(ResCNNEncoder, self).__init__()
        self.CNN_embed_dim = CNN_embed_dim
        self.drop_p = drop_p
        self.fc_hidden1, self.fc_hidden2 = fc_hidden

        resnet = models.resnet18(pretrained=True)
        # replace the first conv so the network accepts 1-channel (grayscale) frames
        resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        modules = list(resnet.children())[:-1]  # delete the last fc layer
        self.resnet = nn.Sequential(*modules)
        self.fc1 = nn.Linear(resnet.fc.in_features, self.fc_hidden1)
        self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)
        self.fc3 = nn.Linear(self.fc_hidden2, self.CNN_embed_dim)

    def forward(self, x_3d):
        # x_3d: (batch, time, channels, height, width)
        cnn_embed_seq = []
        for t in range(x_3d.size(1)):
            # ResNet backbone on frame t
            x = self.resnet(x_3d[:, t, :, :, :])
            x = F.relu(x)
            x = x.view(x.size(0), -1)  # flatten output of conv
            # FC layers
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = F.relu(self.fc3(x))
            x = F.dropout(x, p=self.drop_p, training=self.training)
            cnn_embed_seq.append(x)
        # (time, batch, embed) -> (batch, time, embed)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        return cnn_embed_seq
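As a quick sanity check of the encoder's output shape (a minimal sketch; the 16-frame clip is an arbitrary placeholder):

enc = ResCNNEncoder(CNN_embed_dim=128, fc_hidden=(128, 128), drop_p=0.3)
dummy = torch.randn(1, 16, 1, 128, 128)  # (batch, time, channels, height, width)
print(enc(dummy).shape)                  # torch.Size([1, 16, 128])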
class DecoderRNN(nn.Module):
    def __init__(self, CNN_embed_dim=300, h_RNN=256, drop_p=0.3, num_classes=2):
        super(DecoderRNN, self).__init__()
        self.RNN_input_size = CNN_embed_dim
        self.h_RNN = h_RNN  # RNN hidden nodes
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM_first = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,
            num_layers=1,
            batch_first=True,
        )
        self.LSTM_mid = nn.LSTM(
            input_size=self.h_RNN,
            hidden_size=self.h_RNN,
            num_layers=1,
            batch_first=True,
        )
        # the last LSTM maps directly to the class scores
        self.LSTM_last = nn.LSTM(
            input_size=self.h_RNN,
            hidden_size=self.num_classes,
            num_layers=1,
            batch_first=True,
        )

    def forward(self, x):
        x = F.relu(self.LSTM_first(x)[0])
        # the same middle LSTM (shared weights) is applied four times
        x = F.relu(self.LSTM_mid(x)[0])
        x = F.relu(self.LSTM_mid(x)[0])
        x = F.relu(self.LSTM_mid(x)[0])
        x = F.relu(self.LSTM_mid(x)[0])
        x = F.relu(self.LSTM_last(x)[0])
        return x
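And the matching shape check for the decoder (again a sketch with a placeholder sequence length):

dec = DecoderRNN(CNN_embed_dim=128, h_RNN=256, drop_p=0.3, num_classes=40)
feats = torch.randn(1, 16, 128)  # (batch, time, embed), as produced by the encoder
print(dec(feats).shape)          # torch.Size([1, 16, 40])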
My problem is that after I train for a few iterations, the prediction becomes all blank, or a single class over the whole sequence. It seems the model cannot adjust any weights and learns nothing.
The model's input is a stack of video frames with dimension (1, number of frames in the video, 1, 128, 128).
The output of the model has dimension (number of frames in the video, 1, 40).
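For reference, here is a minimal sketch of standard greedy CTC decoding (collapse repeated labels, then drop blank=0), which is how the "blank or one class" result shows up; the random tensor is just a placeholder for the real model output:

output = torch.randn(100, 1, 40)        # placeholder for the (frames, 1, 40) model output
pred = output.argmax(dim=2).squeeze(1)  # best class per time step
decoded, prev = [], -1
for p in pred.tolist():
    if p != prev and p != 0:            # collapse repeats, drop blanks
        decoded.append(p)
    prev = p
print(decoded)  # with my model's real output this is empty or a single class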
cnn_encoder = ResCNNEncoder(CNN_embed_dim=128, fc_hidden=(128, 128), drop_p=0.3).to(device)
rnn_decoder = DecoderRNN(CNN_embed_dim=128, h_RNN=256, drop_p=0.3, num_classes=40).to(device)

params = list(cnn_encoder.parameters()) + list(rnn_decoder.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate)
ctc_loss = nn.CTCLoss(blank=0, reduction='none')

for video_index in range(len(X_train)):
    # load_video returns a stacked array of frames with shape (1, number of frames, 1, 128, 128)
    X = load_video(data_path, X_train[video_index], (128, 128))
    # target sequence for the loss, excluding blank (0)
    target = y_train[video_index][0:X.shape[1]]
    X = X / 255
    X = torch.tensor(X)
    target = torch.tensor(target).view(1, -1)
    X = X.to(device=device, dtype=torch.float)
    target = target.to(device=device, dtype=torch.long)

    optimizer.zero_grad()
    output = rnn_decoder(cnn_encoder(X))
    output = output.view(output.shape[1], 1, -1)  # (time, batch, classes), as CTCLoss expects
    log_probs = output.log_softmax(2)  # per-time-step log-probabilities for CTC
    input_lengths = torch.full(size=(1,), fill_value=output.shape[0], dtype=torch.long)
    target_lengths = torch.full(size=(1,), fill_value=target.shape[1], dtype=torch.long)
    step_loss = ctc_loss(log_probs, target, input_lengths, target_lengths)
    step_loss.backward()  # reduction='none' gives a 1-element loss at batch size 1
    optimizer.step()
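To verify whether any weights are actually updating, a gradient-norm probe could be placed right after step_loss.backward() (a hypothetical debugging snippet, not part of my training script):

total_norm = 0.0
for p in params:
    if p.grad is not None:  # skip parameters that received no gradient
        total_norm += p.grad.data.norm(2).item() ** 2
print('grad norm:', total_norm ** 0.5)  # stays near zero if nothing is learning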
I tried changing the optimizer and decreasing/increasing the learning rate, but the problem stays the same.
Best regards,
Pannattee