DQN Failing to solve Lunar Lander

I'm currently trying to train an agent to play Lunar Lander from OpenAI Gym using a DQN, but I cannot get it to "solve" the environment. I've previously managed to solve it with agents trained using REINFORCE and REINFORCE with baseline.

I've tried toying with every parameter I can think of and changing the network architecture, but nothing seems to actually help. Adding gradient clipping stabilized things somewhat, but the agent still doesn't solve the environment the way both REINFORCE agents did.

A typical training session for the DQN agent, where every 50 training episodes I report the average score over 100 evaluation episodes, looks like this:

100 game avg after 50 episodes: -583.17 - Eps: 0.73
100 game avg after 100 episodes: -489.79 - Eps: 0.50
100 game avg after 150 episodes: -523.68 - Eps: 0.30
100 game avg after 200 episodes: -136.49 - Eps: 0.10
100 game avg after 250 episodes: -126.62 - Eps: 0.01
100 game avg after 300 episodes: -858.92 - Eps: 0.01
100 game avg after 350 episodes: -416.21 - Eps: 0.01
100 game avg after 400 episodes: -125.24 - Eps: 0.01
100 game avg after 450 episodes: -126.12 - Eps: 0.01
100 game avg after 500 episodes: -195.42 - Eps: 0.01

The average reward hovers around -130, spikes down for an evaluation or two, then settles back around -130. I've been trying to figure this out for a couple of days now, and if I hadn't seen others solve this environment with a DQN, I'd assume it couldn't be done.

I'm not sure if it's something small that I've missed and am just blind to, but any insight would be appreciated.

ReplayBuffer.py

import numpy as np
import torch
class ReplayBuffer:
    def __init__(self,buffer_size,in_dims,batch_size,device):
        self.buffer_size = buffer_size
        self.in_dims = in_dims
        self.batch_size = batch_size
        self.device = device
        self.ptr = 0
        self.items_in_buffer = 0
        self.states = np.zeros((buffer_size,*in_dims),dtype=np.float32)
        self.next_states = np.zeros((buffer_size,*in_dims),dtype=np.float32)
        self.actions = np.zeros(buffer_size,dtype=np.int32)
        self.rewards = np.zeros(buffer_size,dtype=np.float32)
        self.done = np.zeros(buffer_size,dtype=np.bool_)

    def add(self,state,state_,action,reward,done):
        self.states[self.ptr] = state
        self.next_states[self.ptr] = state_
        self.actions[self.ptr] = action
        self.rewards[self.ptr] = reward
        self.done[self.ptr] = done

        self.ptr  = (self.ptr+1)%self.buffer_size
        self.items_in_buffer = min(self.items_in_buffer+1,self.buffer_size)

    def sample(self):
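        # draw a uniform random minibatch (without replacement) from the filled portion of the buffer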
        inds = np.random.choice(self.items_in_buffer,self.batch_size,replace=False)

        states = torch.Tensor(self.states[inds]).to(self.device)
        states_ = torch.Tensor(self.next_states[inds]).to(self.device)
        actions = torch.LongTensor(self.actions[inds]).to(self.device)
        rewards = torch.Tensor(self.rewards[inds]).to(self.device)
        done = torch.IntTensor(self.done[inds]).to(self.device)

        return (states,states_,actions,rewards,done)

    def is_sampleable(self):
        return self.items_in_buffer > self.batch_size

Agent.py

import torch
import torch.nn as nn
import numpy as np
import ReplayBuffer as rb

class QNetwork(nn.Module):
    def __init__(self,inputs,outputs,lr,device):
        super(QNetwork,self).__init__()
        self.model = nn.Sequential(
            nn.Linear(inputs,64),
            nn.ReLU(),
            nn.Linear(64,outputs)
        )
        self.optim = torch.optim.Adam(self.model.parameters(),lr=lr)
        self.loss = nn.MSELoss()
        self.to(device)

    def forward(self,x):
        return self.model(x)
    
class QAgent:
    def __init__(self,name,in_dims,out_size,action_space,lr,device):
        self.QNet = QNetwork(in_dims[0],out_size,lr,device)
        self.QNetTarg = QNetwork(in_dims[0],out_size,lr,device)
        self.name = name
        self.in_dims = in_dims
        self.action_space = action_space
        self.device=device
        self.updates = 0

        self.eps = 0
        self.eps_decay = 0
        self.eps_min = 0
        self.train_freq = 0
        self.num_steps = 0
        self.gamma = 1

        self.replay_buffer = rb.ReplayBuffer(100000,in_dims,64,device)

    def __call__(self,x) -> torch.Tensor:
        return self.QNet(x)
    
    def select_action(self,x,eval:bool):
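        # epsilon-greedy: act greedily on the Q-values when evaluating or with probability 1 - eps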
        if np.random.uniform() > self.eps or eval:
            q_vals = self(torch.Tensor(x,device=self.device))
            action = q_vals.argmax().item()
        else:
            action = np.random.choice(self.action_space)

        return action
    
    def update(self):
        #self.num_steps%self.train_freq != 0 or
        if not self.replay_buffer.is_sampleable():
            return
        
        state,state_,action,reward,done = self.replay_buffer.sample()
        
        q_eval = self(state).gather(1,action.unsqueeze(0))

        q_next = self.QNetTarg(state_).detach()
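        # bootstrap with the greedy action value from the target network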
        q_next = q_next.max(dim=1)[0].unsqueeze(0)
       
        q_targ = reward + self.gamma * q_next * (1-done)

        loss = self.QNet.loss(q_eval,q_targ)

        self.QNet.optim.zero_grad()
        loss.backward()
        for param in self.QNet.parameters():
            param.grad.data.clamp_(-1, 1)

        self.QNet.optim.step()

        self.eps = max(self.eps-self.eps_decay,self.eps_min)

        self.updates += 1

        #copy QNet to target net every 1000 updates
        if self.updates % 1000 == 0:
            self.QNetTarg.load_state_dict(self.QNet.state_dict())

    
    def play_episode(self,env,eval):
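        # run one episode; store transitions and take a learning step every env step unless evaluating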
        ep_reward = 0
        observation,info = env.reset()

        while True:
            action = self.select_action(observation,eval)
            observation_, reward, terminated, truncated, info = env.step(action)
            self.num_steps += 1
            ep_reward += reward
            if not eval:
                self.replay_buffer.add(observation,observation_,action,reward,terminated or truncated)
                self.update()
            observation = observation_

            if truncated or terminated:
                return ep_reward
            
    def eval_agent(self,env,num_games = 100):
        rewards = []

        for _ in range(num_games):
            r = self.play_episode(env,True)
            rewards.append(r)

        rewards = np.array(rewards)
        return rewards.mean()
            
    def train(self,env,gamma:float,num_games:int,eps:float,eps_min:float,eps_decay:float,train_freq:int,eval_freq:int):
        self.gamma = gamma
        self.eps = eps
        self.eps_min = eps_min
        self.eps_decay = eps_decay
        self.train_freq = train_freq

        eval_rewards = []
        train_rewards = []
        

        for i in range(num_games):
            r = self.play_episode(env,False)
            train_rewards.append(r)
            print("%d/%d"%(i,num_games),end="\r")
            if (i+1)%eval_freq == 0:
                avg = self.eval_agent(env)
                # eval_rewards[i+1] = avg
                eval_rewards.append(avg)
                print("100 game avg after %d episodes: %.2f - Eps: %.2f"%(i+1,avg,self.eps))


        return train_rewards,eval_rewards

Controller.py

import gymnasium as gym
import Agent as ag
import torch
from matplotlib import pyplot as plt

device = torch.device('cpu')
env = gym.make("LunarLander-v2", render_mode=None)
agent = ag.QAgent("test",[8],4,[0,1,2,3],5e-4,device)


train_scores,eval_scores = agent.train(env,
            gamma=.99,
            num_games = 500,
            eps = 1.0,
            eps_min = .01,
            eps_decay = 5e-5,
            train_freq = 100,
            eval_freq = 50)

plt.subplot(2,1,1)
plt.plot(train_scores,label="Train Scores")
plt.legend()

plt.subplot(2,1,2)
plt.plot(eval_scores,label="Eval Scores")
plt.legend()

plt.show()

env.close()

I've come back around to this after implementing a couple of other models, and I've fixed at least part of the issue. The agent is now able to learn somewhat, but it is still very unstable: it achieves positive rewards only for a short time before sliding back into poor behavior, and it never reaches the same performance as the other agents I've implemented.

Inside the update method of the QAgent class, the line

q_eval = self(state).gather(1,action.unsqueeze(0))

is wrong. The unsqueeze(0) on the actions was changing their shape to (1, batch_size), which caused gather to select every value from the Q-values of the first state in the batch instead of one Q-value per sample.

So given state evaluations

tensor([[ 1.6790, -0.7743, -0.0967, -0.8329],
        [ 0.0961,  0.0018, -0.4497,  0.5906],
        [-0.1898,  0.8576,  0.6413, -1.6045],
        [ 0.5393, -0.1071,  1.0365,  0.8837]])

and actions [0,0,0,0], gather was returning

tensor([[1.6790, 1.6790, 1.6790, 1.6790]])

instead of

tensor([[1.6790, 0.0961, -0.1898, 0.5393]])
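
For reference, a minimal sketch of how the shapes should line up in update() (a sketch rather than my exact code: unsqueeze the actions along dim 1 instead, and give reward and done a matching extra dimension so every term is (batch_size, 1) and nothing broadcasts silently):

q_eval = self(state).gather(1,action.unsqueeze(1))  # (batch_size, 1): Q-value of the action actually taken

q_next = self.QNetTarg(state_).detach()
q_next = q_next.max(dim=1)[0].unsqueeze(1)          # (batch_size, 1): greedy value from the target network

# TD target; (1 - done) zeroes out the bootstrap term on terminal transitions
q_targ = reward.unsqueeze(1) + self.gamma * q_next * (1-done.unsqueeze(1))

loss = self.QNet.loss(q_eval,q_targ)

With action.unsqueeze(1), gather returns a (batch_size, 1) column containing one Q-value per sample (the values shown above), rather than a row built entirely from the first sample's Q-values.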