Hi everyone,
i’m trying to build an imitation learning model where i have a set of videos (divided into frames), where a robot is moving in an environment. My gts are the poses of the robot in each frame. the network is composed by a cnn to process the frames and a rnn that takes the features of the current frame, and 4 previous frame together with their outputted poses, to predict the next pose of the robot.
The problem is that the network always outputs very similar values and loss is pretty much the same every time i run the code and it doen’t decrease through epochs. i tried changing activation functions, and hyperparameters but the problem remains.
this is my code: class CNN(nn.Module):
def init(self):
super(CNN, self).init()
self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3)
self.pool1 = nn.MaxPool2d(kernel_size=2)
self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3)
self.pool2 = nn.MaxPool2d(kernel_size=2)
self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
self.pool3 = nn.MaxPool2d(kernel_size=2)
self.fc1 = nn.Linear(in_features=43264, out_features=512)
#self.fc2 = nn.Linear(in_features=128, out_features=64)
def forward(self, x):
x = self.conv1(x)
#x = F.relu(x)
x = self.pool1(x)
x = self.conv2(x)
#x = F.relu(x)
x = self.pool2(x)
x = self.conv3(x)
#x = F.relu(x)
x = self.pool3(x)
x = torch.flatten(x, start_dim=0)
x = self.fc1(x)
#print(x.shape)
#x = F.relu(x)
#x = self.fc2(x)
#x = F.relu(x)
#print(x)
return x
#sliding window rnn
class RNN(nn.Module):
def init(self, input_size, hidden_size, num_layers, output_size):
super(RNN, self).init()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, x):
#batch_size = x.size(0)
seq_length = x.size(1) #number of frames taken in the sequence
h0 = (torch.zeros(self.num_layers, 1, self.hidden_size), torch.zeros(self.num_layers, 1, self.hidden_size))
hn = h0
predictions = []
#print(x.shape)
# always consider a sequence of fixed length
input_seq_length = 5
if seq_length >= 5: # take just last input_seq_length elements
xi = x[:, -input_seq_length:, :]
#print(xi.shape)
out, hn = self.rnn(xi, hn)
yi = self.fc(out[:,-1,:])
predictions.append(yi)
else: # take all elements from the beginning
xi = x[:, :seq_length, :]
#print(xi.shape)
out, hn = self.rnn(xi, hn)
yi = self.fc(out[:,-1,:])
predictions.append(yi)
predictions = torch.stack(predictions, dim=1)
return predictions
input_size = 524
hidden_size = 128
num_layers = 1
output_size = 12
cnn_model = CNN()
#cnn_model = ResNet()
rnn_model = RNN(input_size, hidden_size, num_layers, output_size)
num_epochs = 100
optimizer = optim.Adam(list(cnn_model.parameters()) + list(rnn_model.parameters()), lr=0.00001)
#TRAIN
losses = []
best_loss = float(‘inf’)
cnn_model.train()
rnn_model.train()
for epoch in range(num_epochs):
videos_loss = 0
for i in range(len(X_train)): #for each video
optimizer.zero_grad()
prev_outputs = torch.clone(y_train[i][1][0]).unsqueeze(0)
prev_frames = cnn_model(X_train[i][1][0]).unsqueeze(0)
video_loss = 0
for j in range(len(X_train[i][1])-1): #for each frame (apart from the last one)
#CNN
features = cnn_model(X_train[i][1][j]).unsqueeze(0) # shape: (1, cnn_output_size)
if j == 0:
x = torch.cat((features, prev_outputs), dim=1).unsqueeze(0) # shape: (1, 1, cnn_output_size + output_size))
else:
x = torch.cat((prev_frames, features), dim=0)
x = torch.cat((prev_frames, prev_outputs), dim=1).unsqueeze(0) # shape: (1, seq_length, cnn_output_size + output_size))
#print(x.shape)
#RNN
output = rnn_model(x)
prev_outputs = torch.cat((prev_outputs.clone(), output.squeeze(0)), dim=0)
prev_frames = torch.cat((prev_frames.clone(), features))
frame_loss = F.l1_loss(output.squeeze(0).squeeze(0), y_train[i][1][j+1])
video_loss += frame_loss
#frame_loss.backward()
#optimizer.step()
video_loss /= avg_n_frames
video_loss.backward(retain_graph=False)
optimizer.step()
videos_loss += video_loss.item()
print("Epoch", epoch + 1, "train loss:", videos_loss/len(X_train))
thank you very much to everyone!!