I’m trying to make a deep Q-network for the LunarLander-v2 environment. The model runs, but it isn’t converging at all: neither the loss nor the episode rewards are improving. What’s wrong with my code?
Here’s a link to my code on Colab: Google Colab
Some of the more important parts:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        # Note: these two heads are never used in forward() below
        self.action_layer = nn.Linear(128, 4)
        self.value_layer = nn.Linear(128, 1)

        self.state_size = 8    # LunarLander-v2 observations have 8 dims
        self.action_size = 4   # 4 discrete actions

        # Separate embeddings for state and action, merged in forward()
        self.state_dense = nn.Linear(self.state_size, 256)
        self.action_dense = nn.Linear(self.action_size, 256)
        self.hl1 = nn.Linear(256, 512)
        self.dp1 = nn.Dropout(0.25)
        self.hl2 = nn.Linear(512, 512)
        self.dp2 = nn.Dropout(0.25)
        self.outputs = nn.Linear(512, 1)

        # Per-episode buffers filled during rollouts
        self.state_values = []
        self.rewards = []
    def forward(self, state, action):
        # Inputs arrive as numpy, so convert to tensors here
        state = torch.from_numpy(state).float()
        state = F.relu(self.state_dense(state))

        action = torch.from_numpy(np.array(action)).float()
        action = F.relu(self.action_dense(action))

        # Merge the two 256-dim embeddings element-wise
        combined = torch.mul(state, action)

        value = F.relu(self.hl1(combined))
        value = self.dp1(value)
        value = F.relu(self.hl2(value))
        value = self.dp2(value)
        value = self.outputs(value)

        # Keep the prediction around for calculateLoss()
        self.state_values.append(value)
        return value
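    # (Hypothetical helper, not from the notebook.) Since the network scores a
    # single (state, action) pair, action selection would typically evaluate it
    # once per discrete action and take the argmax, e.g. epsilon-greedily:
    def act(self, state, epsilon=0.1):
        if np.random.rand() < epsilon:
            return np.random.randint(self.action_size)       # explore
        one_hot = np.eye(self.action_size, dtype=np.float32)
        with torch.no_grad():
            q_values = [self(state, one_hot[a]).item()
                        for a in range(self.action_size)]
        # forward() appends every prediction to self.state_values, so drop the
        # evaluation-only entries to keep one stored value per environment step
        del self.state_values[-self.action_size:]
        return int(np.argmax(q_values))                      # exploit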
    def calculateLoss(self, gamma=0.99):
        # Discounted returns, computed backwards over the episode
        rewards = []
        dis_reward = 0
        for reward in self.rewards[::-1]:
            dis_reward = reward + gamma * dis_reward
            rewards.insert(0, dis_reward)

        # Normalize the returns; the epsilon avoids dividing by a zero std
        rewards = torch.tensor(rewards, dtype=torch.float32)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)

        # Regress each stored prediction onto its return; squeeze() matches
        # the [1]-shaped prediction to the scalar target, and everything
        # stays in float32 so the dtypes agree
        loss = 0
        for value, reward in zip(self.state_values, rewards):
            loss += F.mse_loss(value.squeeze(), reward)
        return loss

    def clearMemory(self):
        # Called after each update (below) to reset the episode buffers
        del self.state_values[:]
        del self.rewards[:]
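In case it helps to verify the math, the backwards loop in calculateLoss implements G_t = r_t + gamma * G_{t+1}; a tiny standalone check:

rewards, gamma = [1.0, 0.0, 2.0], 0.99
returns, G = [], 0.0
for r in reversed(rewards):
    G = r + gamma * G
    returns.insert(0, G)
print(returns)   # -> [2.9602..., 1.98, 2.0]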
How I update the network:
# One gradient step at the end of each episode
optimizer.zero_grad()
loss = policy.calculateLoss(gamma)
loss.backward()
optimizer.step()
policy.clearMemory()
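A minimal sketch of the episode loop this update sits in, assuming the classic gym step API and the hypothetical act() helper from above (policy, optimizer, and gamma defined as in the snippets; this shows the assumed structure, not the notebook’s exact code):

import gym

env = gym.make("LunarLander-v2")
for episode in range(1500):
    state = env.reset()
    done = False
    while not done:
        action = policy.act(state)                  # hypothetical helper above
        one_hot = np.eye(4, dtype=np.float32)[action]
        policy(state, one_hot)                      # stores one value for this step
        state, reward, done, _ = env.step(action)
        policy.rewards.append(reward)               # stores the matching reward

    # one gradient step per episode, as above
    optimizer.zero_grad()
    loss = policy.calculateLoss(gamma)
    loss.backward()
    optimizer.step()
    policy.clearMemory()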