I’m trying to make a deep Q-network for the LunarLander-v2 environment. The model runs, but it isn’t converging at all: neither the loss nor the episode rewards are improving. What’s wrong with my code?
Here’s a link to my code on Colab: Google Colab
Some of the more important parts:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        # Note: these two heads are never used in forward() below
        self.action_layer = nn.Linear(128, 4)
        self.value_layer = nn.Linear(128, 1)

        self.state_size = 8    # LunarLander-v2 observations have 8 dims
        self.action_size = 4   # 4 discrete actions

        # Separate embeddings for state and action, merged in forward()
        self.state_dense = nn.Linear(self.state_size, 256)
        self.action_dense = nn.Linear(self.action_size, 256)
        self.hl1 = nn.Linear(256, 512)
        self.dp1 = nn.Dropout(0.25)
        self.hl2 = nn.Linear(512, 512)
        self.dp2 = nn.Dropout(0.25)
        self.outputs = nn.Linear(512, 1)

        # Per-episode buffers filled during rollouts
        self.state_values = []
        self.rewards = []
    def forward(self, state, action):
        # Inputs arrive as numpy, so convert to tensors here
        state = torch.from_numpy(state).float()
        state = F.relu(self.state_dense(state))

        action = torch.from_numpy(np.array(action)).float()
        action = F.relu(self.action_dense(action))

        # Merge the two 256-dim embeddings element-wise
        combined = torch.mul(state, action)

        value = F.relu(self.hl1(combined))
        value = self.dp1(value)
        value = F.relu(self.hl2(value))
        value = self.dp2(value)
        value = self.outputs(value)

        # Keep the prediction around for calculateLoss()
        self.state_values.append(value)
        return value
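    # (Hypothetical helper, not from the notebook.) Since the network scores a
    # single (state, action) pair, action selection would typically evaluate it
    # once per discrete action and take the argmax, e.g. epsilon-greedily:
    def act(self, state, epsilon=0.1):
        if np.random.rand() < epsilon:
            return np.random.randint(self.action_size)       # explore
        one_hot = np.eye(self.action_size, dtype=np.float32)
        with torch.no_grad():
            q_values = [self(state, one_hot[a]).item()
                        for a in range(self.action_size)]
        # forward() appends every prediction to self.state_values, so drop the
        # evaluation-only entries to keep one stored value per environment step
        del self.state_values[-self.action_size:]
        return int(np.argmax(q_values))                      # exploit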
    def calculateLoss(self, gamma=0.99):
        # Discounted returns, computed backwards over the episode
        rewards = []
        dis_reward = 0
        for reward in self.rewards[::-1]:
            dis_reward = reward + gamma * dis_reward
            rewards.insert(0, dis_reward)

        # Normalize the returns; the epsilon avoids dividing by a zero std
        rewards = torch.tensor(rewards, dtype=torch.float32)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)

        # Regress each stored prediction onto its return; squeeze() matches
        # the [1]-shaped prediction to the scalar target, and everything
        # stays in float32 so the dtypes agree
        loss = 0
        for value, reward in zip(self.state_values, rewards):
            loss += F.mse_loss(value.squeeze(), reward)
        return loss

    def clearMemory(self):
        # Called after each update (below) to reset the episode buffers
        del self.state_values[:]
        del self.rewards[:]
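In case it helps to verify the math, the backwards loop in calculateLoss implements G_t = r_t + gamma * G_{t+1}; a tiny standalone check:

rewards, gamma = [1.0, 0.0, 2.0], 0.99
returns, G = [], 0.0
for r in reversed(rewards):
    G = r + gamma * G
    returns.insert(0, G)
print(returns)   # -> [2.9602..., 1.98, 2.0]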
How I update the network:
# One gradient step at the end of each episode
optimizer.zero_grad()
loss = policy.calculateLoss(gamma)
loss.backward()
optimizer.step()
policy.clearMemory()
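A minimal sketch of the episode loop this update sits in, assuming the classic gym step API and the hypothetical act() helper from above (policy, optimizer, and gamma defined as in the snippets; this shows the assumed structure, not the notebook’s exact code):

import gym

env = gym.make("LunarLander-v2")
for episode in range(1500):
    state = env.reset()
    done = False
    while not done:
        action = policy.act(state)                  # hypothetical helper above
        one_hot = np.eye(4, dtype=np.float32)[action]
        policy(state, one_hot)                      # stores one value for this step
        state, reward, done, _ = env.step(action)
        policy.rewards.append(reward)               # stores the matching reward

    # one gradient step per episode, as above
    optimizer.zero_grad()
    loss = policy.calculateLoss(gamma)
    loss.backward()
    optimizer.step()
    policy.clearMemory()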