Hello everyone,

I embarked on a project to teach a DQN to play the classic Pong from the OpenAI gym. I encountered a peculiar issue where the agent struggled to learn an effective strategy, often losing games. To break down the problem and understand it more intuitively, I designed an extremely simplified version of the Pong environment. It’s a basic 2x3 matrix where the ball consistently moves from right to left. The agent earns a reward for successfully hitting the ball and incurs a penalty for missing it.

However, even with such a straightforward setup, my DQN frequently falls into local minima. One observation I made is that when I fix the NumPy seed with `numpy.random.seed()`, the main factor in the agent’s ability to learn becomes the seed value chosen initially. How could I make the learning process less dependent on the seed value?

Thank you in advance for your advice and expertise!

```
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
# Seed NumPy's global RNG. In this script it drives the ball's starting row
# (np.random.shuffle in SimplePong.reset) and the epsilon-greedy draws
# (np.random.rand / np.random.choice in DQNAgent.get_action).
# NOTE(review): no torch seeding call is visible in this script, so the
# nn.Linear weight initialisation is presumably not covered by this seed —
# confirm whether that is intended.
np.random.seed(123)
class SimplePong:
    """Minimal 2x3 Pong board flattened into a 6-element 0/1 vector.

    Index layout (row-major)::

        0 1 2    <- top row
        3 4 5    <- bottom row

    Column 0 (indices 0 and 3) holds the paddle. The ball starts in column 2
    (index 2 or 5) and moves one cell to the left per step. Paddle and ball
    are both encoded as ``1``, so when the ball lands on the paddle the two
    marks merge and the vector sums to 1.
    """

    # (source index, destination index) for every possible ball move.
    _BALL_MOVES = ((1, 0), (2, 1), (4, 3), (5, 4))

    def __init__(self):
        self.reset()

    def reset(self):
        """Start a new rally and return the initial state vector.

        The paddle is placed at index 0 and the ball at index 2 or 5, chosen
        with NumPy's global RNG.
        """
        self.state = np.array([1, 0, 0, 0, 0, 0])
        starting_positions = [2, 5]
        np.random.shuffle(starting_positions)
        self.state[starting_positions[0]] = 1
        # Snapshot only after the ball has been placed so that ``state`` and
        # ``next_state`` describe the same board.
        self.next_state = self.state.copy()
        return self.state

    def move_ball(self):
        """Move the ball one cell to the left in ``next_state`` and return it.

        The ball's position is read from ``state``; only ``next_state`` is
        modified.
        """
        for src, dst in self._BALL_MOVES:
            if self.state[src] == 1:
                self.next_state[src], self.next_state[dst] = 0, 1
                break
        return self.next_state

    def take_action(self, action):
        """Rebuild ``next_state`` from ``state`` and apply the paddle action.

        ``action == 1`` swaps indices 0 and 3 (the paddle changes row); any
        other value leaves the paddle where it is.
        """
        # Always start from the current state: previously this happened only
        # for action 1, so the no-op action silently relied on the caller
        # having passed a ``next_state`` identical to ``state``.
        self.next_state = self.state.copy()
        if action == 1:
            self.next_state[0], self.next_state[3] = (
                self.next_state[3],
                self.next_state[0],
            )

    def get_reward(self):
        """Return +1 for a hit, -1 for a miss and 0 while the ball is in flight."""
        if np.sum(self.next_state) == 1:
            return 1  # Ball was hit by paddle (both marks share one cell)
        elif self.next_state[0] and self.next_state[3]:
            return -1  # Ball passed the paddle (ball and paddle in different rows)
        else:
            return 0

    def step(self, state, action, next_state):
        """Perform one game step.

        Args:
            state: Current 6-element board vector.
            action: 1 moves the paddle to the other row; anything else keeps
                it in place.
            next_state: Accepted for backward compatibility only. The
                successor is always derived from ``state``, so a stale buffer
                can no longer corrupt the transition.

        Returns:
            Tuple ``(state, reward, next_state)``.
        """
        self.state = state
        self.take_action(action)
        self.next_state = self.move_ball()
        reward = self.get_reward()
        return self.state, reward, self.next_state
class DQNAgent(nn.Module):
    """Two-layer Q-network that carries its own optimiser and TD update.

    The network maps a 6-element board vector to 2 Q-values, one per action.
    """

    def __init__(self):
        super().__init__()
        self.input_size = 6
        self.hidden_size = 4
        self.output_size = 2
        self.fc1 = nn.Linear(self.input_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.output_size)
        self.relu = nn.ReLU()  # Activation function
        self.gamma = 0.99  # Discount factor applied to the bootstrapped value
        self.optimizer = optim.Adam(self.parameters(), lr=0.00005)
        self.loss_fn = nn.MSELoss()

    def forward(self, x):
        """Return the Q-values for input tensor ``x``."""
        x = self.fc1(x)
        x = self.relu(x)  # Apply activation after first layer
        y_hat = self.fc2(x)
        return y_hat

    def get_action(self, x, epsilon=0.1):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action from ``[0, 1]``
        is returned; otherwise the index of the largest Q-value for ``x``.
        """
        if np.random.rand() < epsilon:
            return np.random.choice([0, 1])
        # Action selection needs no gradients.
        with torch.no_grad():
            q_values = self.forward(torch.tensor(x, dtype=torch.float32))
        return torch.argmax(q_values).item()

    def train_step(self, state, action, reward, next_state, done=False):
        """Run one TD(0) update on a single transition.

        Args:
            state: Board vector before the action.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: Board vector after the action.
            done: True when ``next_state`` is terminal. The target is then
                just ``reward``; otherwise it is
                ``reward + gamma * max(Q(next_state))``.
        """
        state = torch.tensor(state, dtype=torch.float32)
        next_state = torch.tensor(next_state, dtype=torch.float32)
        y_hat = self.forward(state)
        # Build the target without tracking gradients. Writing a
        # grad-carrying value into the detached clone would pull the target
        # back into the autograd graph, so the loss would also be minimised
        # by changing Q(next_state).
        with torch.no_grad():
            target = y_hat.detach().clone()
            if done:
                target[action] = reward
            else:
                target[action] = reward + self.gamma * torch.max(
                    self.forward(next_state)
                )
        loss = self.loss_fn(y_hat, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self, state=True, action=None, reward=None, next_state=None,
              done=False):
        """Run a TD update, or toggle training mode like ``nn.Module.train``.

        ``train(state, action, reward, next_state[, done])`` performs one
        update via ``train_step`` and returns ``None``.

        This method overrides ``nn.Module.train(mode)``, which ``eval()``
        calls internally. When ``action`` is omitted the first argument is
        therefore treated as the boolean ``mode`` and forwarded to the base
        class, so ``agent.eval()`` and ``agent.train(True)`` keep working.
        """
        if action is None:
            return super().train(state)
        self.train_step(state, action, reward, next_state, done)


if __name__ == '__main__':
    env = SimplePong()
    agent = DQNAgent()
    episodes = 5000
    rewards_history = []
    for episode in range(episodes):
        # The exploration rate depends only on the episode index, so compute
        # it once per episode rather than on every step.
        epsilon = max(0.01, 0.1 - episode * 0.0001)
        state = env.reset()
        next_state = state.copy()
        total_reward = 0
        for _ in range(200):
            action = agent.get_action(state, epsilon)
            state, reward, next_state = env.step(state, action, next_state)
            # A non-zero reward (hit or miss) ends the rally: the board is
            # reset afterwards, so the transition is terminal and must not
            # bootstrap from next_state.
            done = reward != 0
            agent.train(state, action, reward, next_state, done)
            if done:
                state = env.reset()
                next_state = state.copy()
            else:
                state = next_state.copy()
            total_reward += reward
        rewards_history.append(total_reward)
        if episode % 100 == 0:
            avg_reward_last_100 = np.mean(rewards_history[-100:])
            print(f"Episode {episode}, Average Reward (Last 100 episodes): {avg_reward_last_100}")
```