# Difficulty Training DQN on Simplified Pong Environment - Prone to Local Minima

Hello everyone,

I embarked on a project to teach a DQN to play classic Pong from OpenAI Gym, but ran into a peculiar issue: the agent struggled to learn an effective strategy and often lost games. To break the problem down and understand it more intuitively, I designed an extremely simplified version of the Pong environment: a basic 2x3 grid in which the ball always moves from right to left. The agent earns a reward for hitting the ball and incurs a penalty for missing it.
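
To make the code below easier to follow, here is how the six-element state vector maps onto the grid (this follows directly from `move_ball` and `take_action`):

```python
# State layout: a flat length-6 array read as a 2x3 grid.
#
#   [0] [1] [2]
#   [3] [4] [5]
#
# The paddle occupies index 0 or 3 (the left column); the ball spawns at
# index 2 or 5 and moves one cell to the left per step.
```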

However, even with such a straightforward setup, my DQN frequently falls into local minima. One observation I made: when I call `numpy.random.seed()`, whether the agent learns at all depends mainly on the seed value I choose initially. How could I make the learning process less dependent on the seed value?
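
One aside on reproducibility (as I understand the two libraries): `numpy.random.seed()` only fixes NumPy's generator, which here drives the ball's starting position and the epsilon-greedy draws. The network's weights are initialized from PyTorch's separate RNG, so `torch.manual_seed()` would also have to be set for runs to be fully repeatable:

```python
import numpy as np
import torch

SEED = 123  # arbitrary value
np.random.seed(SEED)     # environment randomness and epsilon-greedy exploration
torch.manual_seed(SEED)  # network weight initialization
```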

```python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

np.random.seed(123)  # note: seeds NumPy only; PyTorch's weight-init RNG is separate

class SimplePong:
    def __init__(self):
        self.reset()

    def reset(self):
        # Paddle starts at index 0 (top-left); ball enters at index 2 or 5
        self.state = np.array([1, 0, 0, 0, 0, 0])
        starting_positions = [2, 5]
        np.random.shuffle(starting_positions)
        self.state[starting_positions[0]] = 1
        self.next_state = self.state.copy()  # copy after the ball is placed
        return self.state

    def move_ball(self):
        # Check ball's position and move it one cell to the left
        if self.state[1] == 1:  # Ball is in the top middle
            self.next_state[1], self.next_state[0] = 0, 1
        elif self.state[2] == 1:  # Ball is in the top right
            self.next_state[2], self.next_state[1] = 0, 1
        elif self.state[4] == 1:  # Ball is in the bottom middle
            self.next_state[4], self.next_state[3] = 0, 1
        elif self.state[5] == 1:  # Ball is in the bottom right
            self.next_state[5], self.next_state[4] = 0, 1
        return self.next_state

    def take_action(self, action):
        # Action 1 flips the paddle between the top-left and bottom-left cells
        if action == 1:
            self.next_state = self.state.copy()
            self.next_state[0], self.next_state[3] = self.next_state[3], self.next_state[0]

    def get_reward(self):
        if np.sum(self.next_state) == 1:
            return 1  # Ball was hit by paddle
        elif self.next_state[0] and self.next_state[3]:
            return -1  # Ball passed the paddle
        else:
            return 0

    def step(self, state, action, next_state):
        """Perform one game step."""
        self.state = state
        self.next_state = next_state
        self.take_action(action)
        self.next_state = self.move_ball()
        reward = self.get_reward()
        return self.state, reward, self.next_state

class DQNAgent(nn.Module):
    def __init__(self):
        super(DQNAgent, self).__init__()
        self.input_size = 6
        self.hidden_size = 4
        self.output_size = 2

        self.fc1 = nn.Linear(self.input_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.output_size)
        self.relu = nn.ReLU()  # Activation function

        self.gamma = 0.99
        self.loss_fn = nn.MSELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=1e-3)  # was missing; lr is my pick

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)  # Apply activation after first layer
        y_hat = self.fc2(x)
        return y_hat

    def get_action(self, x, epsilon=0.1):
        # Epsilon-greedy: random action with probability epsilon
        if np.random.rand() < epsilon:
            return np.random.choice([0, 1])

        x = torch.tensor(x, dtype=torch.float32)
        q_values = self.forward(x)
        return int(torch.argmax(q_values).item())  # greedy action (return was missing)

    def train(self, state, action, reward, next_state):
        state = torch.tensor(state, dtype=torch.float32)
        next_state = torch.tensor(next_state, dtype=torch.float32)

        y_hat = self.forward(state)
        with torch.no_grad():  # don't backprop through the bootstrap target
            next_q_values = self.forward(next_state)

        target = y_hat.clone().detach()
        target[action] = reward + self.gamma * torch.max(next_q_values)

        loss = self.loss_fn(y_hat, target)

        self.optimizer.zero_grad()  # was missing; gradients accumulate otherwise
        loss.backward()
        self.optimizer.step()

if __name__ == '__main__':
    env = SimplePong()
    agent = DQNAgent()
    episodes = 5000
    rewards_history = []

    for episode in range(episodes):
        state = env.reset()
        next_state = state.copy()
        total_reward = 0

        for _ in range(200):
            # Decay epsilon linearly from 0.1 towards the 0.01 floor
            epsilon = max(0.01, 0.1 - episode * 0.0001)
            action = agent.get_action(state, epsilon)
            state, reward, next_state = env.step(state, action, next_state)

            agent.train(state, action, reward, next_state)

            if reward != 0:  # point scored either way: start a new rally
                state = env.reset()
                next_state = state.copy()
            else:
                state = next_state.copy()

            total_reward += reward

        rewards_history.append(total_reward)

        if episode % 100 == 0:
            avg_reward_last_100 = np.mean(rewards_history[-100:])
            print(f"Episode {episode}, Average Reward (Last 100 episodes): {avg_reward_last_100}")

```

Update: I now get better results. In the end, it just took quite a lot of hyperparameter tuning.
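
For anyone who hits the same issue: one way to tell whether a hyperparameter change actually helps, rather than just getting lucky with one seed, is to rerun training under several seeds and compare the final scores. A rough sketch (it simply wraps the training loop from above in a function; the seed values are arbitrary):

```python
import numpy as np
import torch

def run_training(seed, episodes=1000):
    """Run the training loop from above under a fixed seed and return
    the average total reward over the last 100 episodes."""
    np.random.seed(seed)
    torch.manual_seed(seed)  # also fixes the network's weight initialization
    env = SimplePong()
    agent = DQNAgent()
    rewards_history = []
    for episode in range(episodes):
        state = env.reset()
        next_state = state.copy()
        total_reward = 0
        for _ in range(200):
            epsilon = max(0.01, 0.1 - episode * 0.0001)
            action = agent.get_action(state, epsilon)
            state, reward, next_state = env.step(state, action, next_state)
            agent.train(state, action, reward, next_state)
            if reward != 0:
                state = env.reset()
                next_state = state.copy()
            else:
                state = next_state.copy()
            total_reward += reward
        rewards_history.append(total_reward)
    return float(np.mean(rewards_history[-100:]))

seeds = [0, 1, 2, 3, 4]  # arbitrary seed values
scores = [run_training(s) for s in seeds]
print(f"mean {np.mean(scores):.2f} +/- {np.std(scores):.2f} across {len(seeds)} seeds")
```

A low standard deviation across seeds is a decent sign that the learning process itself, and not the initialization, is doing the work.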