Cnn+rnn outputs and loss always the same

Hi everyone,

i’m trying to build an imitation learning model where i have a set of videos (divided into frames), where a robot is moving in an environment. My gts are the poses of the robot in each frame. the network is composed by a cnn to process the frames and a rnn that takes the features of the current frame, and 4 previous frame together with their outputted poses, to predict the next pose of the robot.

The problem is that the network always outputs very similar values and loss is pretty much the same every time i run the code and it doen’t decrease through epochs. i tried changing activation functions, and hyperparameters but the problem remains.

this is my code: class CNN(nn.Module):
def init(self):
super(CNN, self).init()
self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3)
self.pool1 = nn.MaxPool2d(kernel_size=2)
self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3)
self.pool2 = nn.MaxPool2d(kernel_size=2)
self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
self.pool3 = nn.MaxPool2d(kernel_size=2)

    self.fc1 = nn.Linear(in_features=43264, out_features=512)
    #self.fc2 = nn.Linear(in_features=128, out_features=64)
def forward(self, x):
    x = self.conv1(x)
    #x = F.relu(x)
    x = self.pool1(x)
    x = self.conv2(x)
    #x = F.relu(x)
    x = self.pool2(x)
    x = self.conv3(x)
    #x = F.relu(x)
    x = self.pool3(x)

    x = torch.flatten(x, start_dim=0)
    x = self.fc1(x)
    #x = F.relu(x)
    #x = self.fc2(x)
    #x = F.relu(x)
    return x

#sliding window rnn
class RNN(nn.Module):
def init(self, input_size, hidden_size, num_layers, output_size):
super(RNN, self).init()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
self.fc = nn.Linear(hidden_size, output_size)

def forward(self, x):
    #batch_size = x.size(0)
    seq_length = x.size(1) #number of frames taken in the sequence
    h0 = (torch.zeros(self.num_layers, 1, self.hidden_size), torch.zeros(self.num_layers, 1, self.hidden_size))
    hn = h0

    predictions = []
    # always consider a sequence of fixed length 

    input_seq_length = 5 

    if seq_length >= 5:          # take just last input_seq_length elements
      xi = x[:, -input_seq_length:, :]
      out, hn = self.rnn(xi, hn)
      yi = self.fc(out[:,-1,:])
    else:                        # take all elements from the beginning
      xi = x[:, :seq_length, :]
      out, hn = self.rnn(xi, hn)
      yi = self.fc(out[:,-1,:])

    predictions = torch.stack(predictions, dim=1)
    return predictions 

input_size = 524
hidden_size = 128
num_layers = 1
output_size = 12
cnn_model = CNN()
#cnn_model = ResNet()
rnn_model = RNN(input_size, hidden_size, num_layers, output_size)
num_epochs = 100
optimizer = optim.Adam(list(cnn_model.parameters()) + list(rnn_model.parameters()), lr=0.00001)

losses = []
best_loss = float(‘inf’)


for epoch in range(num_epochs):

videos_loss = 0

for i in range(len(X_train)): #for each video


  prev_outputs = torch.clone(y_train[i][1][0]).unsqueeze(0)
  prev_frames = cnn_model(X_train[i][1][0]).unsqueeze(0)

  video_loss = 0
  for j in range(len(X_train[i][1])-1): #for each frame (apart from the last one)

        features = cnn_model(X_train[i][1][j]).unsqueeze(0) # shape: (1, cnn_output_size)
        if j == 0:
          x =, prev_outputs), dim=1).unsqueeze(0)  # shape: (1, 1, cnn_output_size + output_size))

          x =, features), dim=0)
          x =, prev_outputs), dim=1).unsqueeze(0)  # shape: (1, seq_length, cnn_output_size + output_size))

        output = rnn_model(x)
        prev_outputs =, output.squeeze(0)), dim=0)
        prev_frames =, features))
        frame_loss = F.l1_loss(output.squeeze(0).squeeze(0), y_train[i][1][j+1])
        video_loss += frame_loss
  video_loss /= avg_n_frames
  videos_loss += video_loss.item()    
print("Epoch", epoch + 1, "train loss:", videos_loss/len(X_train)) 

thank you very much to everyone!!