I'm current trying to train a model to play Lunar Lander from the openAI gym using a DQN, but I cannot get the agent to "solve" the environment. I've previously managed to train agents using REINFORCE and REINFORCE with baseline to solve it.
I’ve tried toying with every parameter I can think of and changing network architecture but nothing seems to actually help. Adding gradient clipping helped stabilize it somewhat but still doesn’t get the agent to solve the environment like I was able to for both REINFORCE agents.
A typical training session for the DQN agent, where I get the average score over 100 episodes is as follows
100 game avg after 50 episodes: -583.17 - Eps: 0.73 100 game avg after 100 episodes: -489.79 - Eps: 0.50 100 game avg after 150 episodes: -523.68 - Eps: 0.30 100 game avg after 200 episodes: -136.49 - Eps: 0.10 100 game avg after 250 episodes: -126.62 - Eps: 0.01 100 game avg after 300 episodes: -858.92 - Eps: 0.01 100 game avg after 350 episodes: -416.21 - Eps: 0.01 100 game avg after 400 episodes: -125.24 - Eps: 0.01 100 game avg after 450 episodes: -126.12 - Eps: 0.01 100 game avg after 500 episodes: -195.42 - Eps: 0.01
It hovers around -130 average reward, then spikes down for an evaluation or two, then goes back to around -130. I've been trying to figure this out for a couple days now and if it weren't for the fact I've seen others solve this environment using DQN I'd figure it can't be done.
I’m not sure if it is something small I’ve missed that I’m blind to or what, but any insight would be appreciated.
ReplayBuffer.py
import numpy as np
import torch
class ReplayBuffer:
def __init__(self,buffer_size,in_dims,batch_size,device):
self.buffer_size = buffer_size
self.in_dims = in_dims
self.batch_size = batch_size
self.device = device
self.ptr = 0
self.items_in_buffer = 0
self.states = np.zeros((buffer_size,*in_dims),dtype=np.float32)
self.next_states = np.zeros((buffer_size,*in_dims),dtype=np.float32)
self.actions = np.zeros(buffer_size,dtype=np.int32)
self.rewards = np.zeros(buffer_size,dtype=np.float32)
self.done = np.zeros(buffer_size,dtype=np.bool_)
def add(self,state,state_,action,reward,done):
self.states[self.ptr] = state
self.next_states[self.ptr] = state_
self.actions[self.ptr] = action
self.rewards[self.ptr] = reward
self.done[self.ptr] = done
self.ptr = (self.ptr+1)%self.buffer_size
self.items_in_buffer = min(self.items_in_buffer+1,self.buffer_size)
def sample(self):
inds = np.random.choice(self.items_in_buffer,self.batch_size,replace=False)
states = torch.Tensor(self.states[inds]).to(self.device)
states_ = torch.Tensor(self.next_states[inds]).to(self.device)
actions = torch.LongTensor(self.actions[inds]).to(self.device)
rewards = torch.Tensor(self.rewards[inds]).to(self.device)
done = torch.IntTensor(self.done[inds]).to(self.device)
return (states,states_,actions,rewards,done)
def is_sampleable(self):
return self.items_in_buffer > self.batch_size
Agent.py
import torch
import torch.nn as nn
import numpy as np
import ReplayBuffer as rb
class QNetwork(nn.Module):
def __init__(self,inputs,outputs,lr,device):
super(QNetwork,self).__init__()
self.model = nn.Sequential(
nn.Linear(inputs,64),
nn.ReLU(),
nn.Linear(64,outputs)
)
self.optim = torch.optim.Adam(self.model.parameters(),lr=lr)
self.loss = nn.MSELoss()
self.to(device)
def forward(self,x):
return self.model(x)
class QAgent:
def __init__(self,name,in_dims,out_size,action_space,lr,device):
self.QNet = QNetwork(in_dims[0],out_size,lr,device)
self.QNetTarg = QNetwork(in_dims[0],out_size,lr,device)
self.name = name
self.in_dims = in_dims
self.action_space = action_space
self.device=device
self.updates = 0
self.eps = 0
self.eps_decay = 0
self.eps_min = 0
self.train_freq = 0
self.num_steps = 0
self.gamma = 1
self.replay_buffer = rb.ReplayBuffer(100000,in_dims,64,device)
def __call__(self,x) -> torch.Tensor:
return self.QNet(x)
def select_action(self,x,eval:bool):
if np.random.uniform() > self.eps or eval:
eval = self(torch.Tensor(x,device=self.device))
action = eval.argmax().item()
else:
action = np.random.choice(self.action_space)
return action
def update(self):
#self.num_steps%self.train_freq != 0 or
if self.replay_buffer.is_sampleable() == False:
return
state,state_,action,reward,done = self.replay_buffer.sample()
q_eval = self(state).gather(1,action.unsqueeze(0))
q_next = self.QNetTarg(state_).detach()
q_next = q_next.max(dim=1)[0].unsqueeze(0)
q_targ = reward + self.gamma * q_next * (1-done)
loss = self.QNet.loss(q_eval,q_targ)
self.QNet.optim.zero_grad()
loss.backward()
for param in self.QNet.parameters():
param.grad.data.clamp_(-1, 1)
self.QNet.optim.step()
self.eps = max(self.eps-self.eps_decay,self.eps_min)
self.updates += 1
#copy QNet to target net every 1000 updates
if self.updates % 1000 == 0:
self.QNetTarg.load_state_dict(self.QNet.state_dict())
def play_episode(self,env,eval):
ep_reward = 0
observation,info = env.reset()
while True:
action = self.select_action(observation,eval)
observation_, reward, terminated, truncated, info = env.step(action)
self.num_steps += 1
ep_reward += reward
if not eval:
self.replay_buffer.add(observation,observation_,action,reward,terminated or truncated)
self.update()
observation = observation_
if truncated or terminated:
return ep_reward
def eval_agent(self,env,num_games = 100):
rewards = []
for _ in range(num_games):
r = self.play_episode(env,True)
rewards.append(r)
rewards = np.array(rewards)
return rewards.mean()
def train(self,env,gamma:float,num_games:int,eps:float,eps_min:float,eps_decay:float,train_freq:int,eval_freq:int):
self.gamma = gamma
self.eps = eps
self.eps_min = eps_min
self.eps_decay = eps_decay
self.train_freq = train_freq
eval_rewards = []
train_rewards = []
for i in range(num_games):
r = self.play_episode(env,False)
train_rewards.append(r)
print("%d/%d"%(i,num_games),end="\r")
if (i+1)%eval_freq == 0:
avg = self.eval_agent(env)
# eval_rewards[i+1] = avg
eval_rewards.append(avg)
print("100 game avg after %d episodes: %.2f - Eps: %.2f"%(i+1,avg,self.eps))
return train_rewards,eval_rewards
Controller.py
import gymnasium as gym
import Agent as ag
import torch
from matplotlib import pyplot as plt
device = torch.device('cpu')
env = gym.make("LunarLander-v2", render_mode=None)
agent = ag.QAgent("test",[8],4,[0,1,2,3],5e-4,device)
train_scores,eval_scores = agent.train(env,
gamma=.99,
num_games = 500,
eps = 1.0,
eps_min = .01,
eps_decay = 5e-5,
train_freq = 100,
eval_freq = 50)
plt.subplot(2,1,1)
plt.plot(train_scores,label="Train Scores")
plt.legend()
plt.subplot(212)
plt.plot(eval_scores,label="Eval Scores")
plt.legend()
plt.show()
env.close()