Difficulty Training DQN on Simplified Pong Environment - Prone to Local Minima

Hello everyone,

I embarked on a project to teach a DQN to play classic Pong from OpenAI Gym. I ran into an issue where the agent struggled to learn an effective strategy and often lost games. To break the problem down and understand it more intuitively, I designed an extremely simplified version of the Pong environment: a 2x3 grid in which the ball always moves from right to left. The agent earns a reward for hitting the ball and incurs a penalty for missing it.
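
To make the encoding concrete: the state is a flat length-6 array, and the layout I have in mind (matching the indices used in the code further down) is:

# Flattened 2x3 grid, row-major:
#   indices 0, 1, 2 -> top row    (the paddle can occupy index 0)
#   indices 3, 4, 5 -> bottom row (the paddle can occupy index 3)
# The ball spawns at index 2 or 5 and moves one cell to the left per step.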

However, even with such a straightforward setup, my DQN frequently gets stuck in local minima. One observation: when I fix the RNG with np.random.seed(), whether the agent learns at all depends mainly on the seed value chosen. How could I make the learning process less dependent on the seed?
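
For context: only NumPy's RNG is seeded in the code below, so the network initialization (PyTorch's own RNG) still varies between runs even with a fixed seed value. A minimal sketch of seeding everything for a fully repeatable single run, using an illustrative seed_everything helper:

import random

import numpy as np
import torch

def seed_everything(seed):
    # Illustrative helper: seed Python's, NumPy's and PyTorch's RNGs so that
    # ball spawns, epsilon-greedy exploration and weight init are all repeatable.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

seed_everything(123)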

Thank you in advance for your advice and expertise!

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

np.random.seed(123)

class SimplePong:
    def __init__(self):
        self.reset()

    def reset(self):
        # Index 0 is the paddle (top-left cell); the ball spawns in the
        # right-hand column, at index 2 (top) or index 5 (bottom).
        self.state = np.array([1, 0, 0, 0, 0, 0])
        starting_positions = [2, 5]
        np.random.shuffle(starting_positions)
        self.state[starting_positions[0]] = 1
        self.next_state = self.state.copy()
        return self.state

    def move_ball(self):
        # Check ball's position and move it
        if self.state[1] == 1:  # Ball is in the top middle
            self.next_state[1], self.next_state[0] = 0, 1
        elif self.state[2] == 1:  # Ball is in the top right
            self.next_state[2], self.next_state[1] = 0, 1
        elif self.state[4] == 1:  # Ball is in the bottom middle
            self.next_state[4], self.next_state[3] = 0, 1
        elif self.state[5] == 1:  # Ball is in the bottom right
            self.next_state[5], self.next_state[4] = 0, 1
        return self.next_state

    def take_action(self, action):
        # Action 1 moves the paddle to the other row by swapping the left-column
        # cells (indices 0 and 3); action 0 leaves the paddle where it is.
        if action == 1:
            self.next_state = self.state.copy()
            self.next_state[0], self.next_state[3] = self.next_state[3], self.next_state[0]

    def get_reward(self):
        if np.sum(self.next_state) == 1:
            return 1   # Ball and paddle share a cell, so only one entry is set: hit
        elif self.next_state[0] and self.next_state[3]:
            return -1  # Ball reached the left column on the row without the paddle: miss
        else:
            return 0   # Ball is still travelling

    def step(self, state, action, next_state):
        """Perform one game step: apply the action, move the ball, compute the reward."""
        self.state = state
        self.next_state = next_state
        self.take_action(action)
        self.next_state = self.move_ball()
        reward = self.get_reward()
        return self.state, reward, self.next_state

class DQNAgent(nn.Module):
    def __init__(self):
        super(DQNAgent, self).__init__()
        self.input_size = 6
        self.hidden_size = 4
        self.output_size = 2

        self.fc1 = nn.Linear(self.input_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.output_size)
        self.relu = nn.ReLU()  # Activation function

        self.gamma = 0.99
        self.optimizer = optim.Adam(self.parameters(), lr=0.00005)
        self.loss_fn = nn.MSELoss()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)  # Apply activation after first layer
        y_hat = self.fc2(x)
        return y_hat

    def get_action(self, x, epsilon=0.1):
        if np.random.rand() < epsilon:
            return np.random.choice([0, 1])
        
        x = torch.tensor(x, dtype=torch.float32)
        q_values = self.forward(x)
        return torch.argmax(q_values).item()

    def train(self, state, action, reward, next_state):
        # Note: this overrides nn.Module.train(); harmless here because
        # train/eval mode is never toggled.
        state = torch.tensor(state, dtype=torch.float32)
        next_state = torch.tensor(next_state, dtype=torch.float32)

        y_hat = self.forward(state)
        # Compute the bootstrap target without tracking gradients so the loss
        # only backpropagates through the Q-values predicted for `state`.
        with torch.no_grad():
            next_q_values = self.forward(next_state)

        target = y_hat.clone().detach()
        target[action] = reward + self.gamma * torch.max(next_q_values)

        loss = self.loss_fn(y_hat, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


if __name__ == '__main__':
    env = SimplePong()
    agent = DQNAgent()
    episodes = 5000
    rewards_history = []

    for episode in range(episodes):
        state = env.reset()
        next_state = state.copy()
        total_reward = 0

        # Epsilon depends only on the episode index, so compute it once per episode.
        epsilon = max(0.01, 0.1 - episode * 0.0001)

        for _ in range(200):
            action = agent.get_action(state, epsilon)
            state, reward, next_state = env.step(state, action, next_state)

            agent.train(state, action, reward, next_state)

            if reward != 0:
                state = env.reset()
                next_state = state.copy()
            else:
                state = next_state.copy()

            total_reward += reward

        rewards_history.append(total_reward)

        if episode % 100 == 0:
            avg_reward_last_100 = np.mean(rewards_history[-100:])
            print(f"Episode {episode}, Average Reward (Last 100 episodes): {avg_reward_last_100}")

I now get better results. In the end it was just quite a lot of hyperparameter tuning :wink:
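
In case anyone wants to check how sensitive a given configuration still is to the seed: a rough sketch of averaging the final performance over a few seeds, reusing the classes above (the evaluate_seed helper and the seed values are just illustrative):

def evaluate_seed(seed, episodes=1000):
    # Illustrative helper (not part of the code above): train a fresh agent
    # under the given seed and return the mean reward of the last 100 episodes.
    np.random.seed(seed)
    torch.manual_seed(seed)
    env, agent = SimplePong(), DQNAgent()
    rewards = []
    for episode in range(episodes):
        state = env.reset()
        next_state = state.copy()
        total = 0
        epsilon = max(0.01, 0.1 - episode * 0.0001)
        for _ in range(200):
            action = agent.get_action(state, epsilon)
            state, reward, next_state = env.step(state, action, next_state)
            agent.train(state, action, reward, next_state)
            if reward != 0:
                state = env.reset()
                next_state = state.copy()
            else:
                state = next_state.copy()
            total += reward
        rewards.append(total)
    return np.mean(rewards[-100:])

print([round(evaluate_seed(s), 2) for s in (0, 1, 2)])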