# Difficulty Training DQN on Simplified Pong Environment - Prone to Local Minima

Hello everyone,

I embarked on a project to teach a DQN to play classic Pong from OpenAI Gym, but ran into a peculiar issue: the agent struggled to learn an effective strategy and often lost games. To break the problem down and understand it more intuitively, I designed an extremely simplified version of the Pong environment: a basic 2x3 grid in which the ball always moves from right to left. The agent earns a reward for hitting the ball and incurs a penalty for missing it.
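
To make the code below easier to follow, here is how the six-element state vector maps onto the grid (this follows directly from `move_ball` and `take_action`):

```python
# State layout: a flat length-6 array read as a 2x3 grid.
#
#   [0] [1] [2]
#   [3] [4] [5]
#
# The paddle occupies index 0 or 3 (the left column); the ball spawns at
# index 2 or 5 and moves one cell to the left per step.
```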

However, even with such a straightforward setup, my DQN frequently falls into local minima. One observation I made: when I call `numpy.random.seed()`, whether the agent learns at all depends mainly on the seed value I choose initially. How could I make the learning process less dependent on the seed value?
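
One aside on reproducibility (as I understand the two libraries): `numpy.random.seed()` only fixes NumPy's generator, which here drives the ball's starting position and the epsilon-greedy draws. The network's weights are initialized from PyTorch's separate RNG, so `torch.manual_seed()` would also have to be set for runs to be fully repeatable:

```python
import numpy as np
import torch

SEED = 123  # arbitrary value
np.random.seed(SEED)     # environment randomness and epsilon-greedy exploration
torch.manual_seed(SEED)  # network weight initialization
```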

```python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

np.random.seed(123)  # note: seeds NumPy only; PyTorch's weight-init RNG is separate

class SimplePong:
    def __init__(self):
        self.reset()

    def reset(self):
        # Paddle starts at index 0 (top-left); ball enters at index 2 or 5
        self.state = np.array([1, 0, 0, 0, 0, 0])
        starting_positions = [2, 5]
        np.random.shuffle(starting_positions)
        self.state[starting_positions[0]] = 1
        self.next_state = self.state.copy()  # copy after the ball is placed
        return self.state

    def move_ball(self):
        # Check ball's position and move it one cell to the left
        if self.state[1] == 1:  # Ball is in the top middle
            self.next_state[1], self.next_state[0] = 0, 1
        elif self.state[2] == 1:  # Ball is in the top right
            self.next_state[2], self.next_state[1] = 0, 1
        elif self.state[4] == 1:  # Ball is in the bottom middle
            self.next_state[4], self.next_state[3] = 0, 1
        elif self.state[5] == 1:  # Ball is in the bottom right
            self.next_state[5], self.next_state[4] = 0, 1
        return self.next_state

    def take_action(self, action):
        # Action 1 flips the paddle between the top-left and bottom-left cells
        if action == 1:
            self.next_state = self.state.copy()
            self.next_state[0], self.next_state[3] = self.next_state[3], self.next_state[0]

    def get_reward(self):
        if np.sum(self.next_state) == 1:
            return 1  # Ball was hit by paddle
        elif self.next_state[0] and self.next_state[3]:
            return -1  # Ball passed the paddle
        else:
            return 0

    def step(self, state, action, next_state):
        """Perform one game step."""
        self.state = state
        self.next_state = next_state
        self.take_action(action)
        self.next_state = self.move_ball()
        reward = self.get_reward()
        return self.state, reward, self.next_state

class DQNAgent(nn.Module):
    def __init__(self):
        super(DQNAgent, self).__init__()
        self.input_size = 6
        self.hidden_size = 4
        self.output_size = 2

        self.fc1 = nn.Linear(self.input_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.output_size)
        self.relu = nn.ReLU()  # Activation function

        self.gamma = 0.99
        self.loss_fn = nn.MSELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=1e-3)  # was missing; lr is my pick

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)  # Apply activation after first layer
        y_hat = self.fc2(x)
        return y_hat

    def get_action(self, x, epsilon=0.1):
        # Epsilon-greedy: random action with probability epsilon
        if np.random.rand() < epsilon:
            return np.random.choice([0, 1])

        x = torch.tensor(x, dtype=torch.float32)
        q_values = self.forward(x)
        return int(torch.argmax(q_values).item())  # greedy action (return was missing)

    def train(self, state, action, reward, next_state):
        state = torch.tensor(state, dtype=torch.float32)
        next_state = torch.tensor(next_state, dtype=torch.float32)

        y_hat = self.forward(state)
        with torch.no_grad():  # don't backprop through the bootstrap target
            next_q_values = self.forward(next_state)

        target = y_hat.clone().detach()
        target[action] = reward + self.gamma * torch.max(next_q_values)

        loss = self.loss_fn(y_hat, target)

        self.optimizer.zero_grad()  # was missing; gradients accumulate otherwise
        loss.backward()
        self.optimizer.step()

if __name__ == '__main__':
    env = SimplePong()
    agent = DQNAgent()
    episodes = 5000
    rewards_history = []

    for episode in range(episodes):
        state = env.reset()
        next_state = state.copy()
        total_reward = 0

        for _ in range(200):
            # Decay epsilon linearly from 0.1 towards the 0.01 floor
            epsilon = max(0.01, 0.1 - episode * 0.0001)
            action = agent.get_action(state, epsilon)
            state, reward, next_state = env.step(state, action, next_state)

            agent.train(state, action, reward, next_state)

            if reward != 0:  # point scored either way: start a new rally
                state = env.reset()
                next_state = state.copy()
            else:
                state = next_state.copy()

            total_reward += reward

        rewards_history.append(total_reward)

        if episode % 100 == 0:
            avg_reward_last_100 = np.mean(rewards_history[-100:])
            print(f"Episode {episode}, Average Reward (Last 100 episodes): {avg_reward_last_100}")

```

Update: I now get better results. In the end, it just took quite a lot of hyperparameter tuning.
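
For anyone who hits the same issue: one way to tell whether a hyperparameter change actually helps, rather than just getting lucky with one seed, is to rerun training under several seeds and compare the final scores. A rough sketch (it simply wraps the training loop from above in a function; the seed values are arbitrary):

```python
import numpy as np
import torch

def run_training(seed, episodes=1000):
    """Run the training loop from above under a fixed seed and return
    the average total reward over the last 100 episodes."""
    np.random.seed(seed)
    torch.manual_seed(seed)  # also fixes the network's weight initialization
    env = SimplePong()
    agent = DQNAgent()
    rewards_history = []
    for episode in range(episodes):
        state = env.reset()
        next_state = state.copy()
        total_reward = 0
        for _ in range(200):
            epsilon = max(0.01, 0.1 - episode * 0.0001)
            action = agent.get_action(state, epsilon)
            state, reward, next_state = env.step(state, action, next_state)
            agent.train(state, action, reward, next_state)
            if reward != 0:
                state = env.reset()
                next_state = state.copy()
            else:
                state = next_state.copy()
            total_reward += reward
        rewards_history.append(total_reward)
    return float(np.mean(rewards_history[-100:]))

seeds = [0, 1, 2, 3, 4]  # arbitrary seed values
scores = [run_training(s) for s in seeds]
print(f"mean {np.mean(scores):.2f} +/- {np.std(scores):.2f} across {len(seeds)} seeds")
```

A low standard deviation across seeds is a decent sign that the learning process itself, and not the initialization, is doing the work.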