Hello everyone! I’m trying to make a RL agent using policy gradients to play pong from a segment of the memory.
Currently, I have this code:
import signal
import sys
import gym
import torch
from torch import nn
import numpy as np
device = "cuda" if torch.cuda.is_available() else "cpu"
print("using {}".format(device))
print("dev", torch.cuda.current_device())
env = gym.make('Pong-ram-v0')
render = False
# observation = env.reset()
significant_sectors = [2, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 20, 21, 49, 50, 54, 56, 58, 67, 69, 73, 121, 122]
state_size = len(significant_sectors)
n = state_size
m = 2 # up or down
num_hidden_neurons = n + m + 2
num_output_neurons = m
def initialize_weights(m):
if type(m) == nn.Linear:
torch.nn.init.xavier_uniform_(m.weight)
torch.nn.init.zeros_(m.bias)
class Reinforcement(nn.Module):
def __init__(self):
super(Reinforcement, self).__init__()
self.net = nn.Sequential(
nn.Linear(n, num_hidden_neurons),
nn.ReLU(),
nn.Linear(num_hidden_neurons, num_hidden_neurons),
nn.ReLU(),
nn.Linear(num_hidden_neurons, num_output_neurons),
nn.Softmax(1)
)
self.net.apply(initialize_weights)
def forward(self, x):
return self.net(x)
model = Reinforcement()
def signal_handler(sig, frame):
print('saving state')
torch.save(model.state_dict(), 'state.torch')
sys.exit(0)
signal.signal(signal.SIGINT, signal_handler)
optim = torch.optim.SGD(model.net.parameters(), lr=1e-5, momentum=0.9)
for episode in range(20000):
predictions = []
observation = env.reset()
probs = []
rewards = []
actions = []
for i in range(1, 10000):
if render:
env.render()
significant_state = torch.from_numpy(np.asarray([observation[x] for x in significant_sectors])).float().unsqueeze(0)
policy = model(significant_state)
m = torch.distributions.Categorical(policy)
action = m.sample()
observation, reward, done, info = env.step(3 if action.item() >= 0.5 else 2)
actions.append(action.item())
probs.append(m.log_prob(action))
rewards.append(reward)
if done:
break
policy_loss = []
rewards = torch.tensor(rewards)
eps = np.finfo(np.float32).eps.item()
if episode % 10 == 0:
print('updating episode {}'.format(episode))
print('reward: {}'.format(torch.sum(rewards)))
rewards = (rewards - rewards.mean()) / (rewards.std() + eps)
for log_prob, reward in zip(probs, rewards):
policy_loss.append(-log_prob * reward)
optim.zero_grad()
loss = torch.cat(policy_loss).mean()
loss.backward()
optim.step()
if episode % 50 == 0:
updating = False
for p in model.parameters():
if torch.norm(p.grad) > 0:
updating = True
break
if updating is False:
print('stopped learning')
break
print('updating: {}'.format(updating))
I have looked at this PyTorch example for the cart pole and it basically solves the problem. I’m trying to replicate something like that but for the pong ram, to no luck.
I’m using the part of the memory that I found to be the only parts that would change over time, after running a few pong simulations. But even with the full memory, I still run into the same problems.
The problems I am facing: oftentimes this algorithm will have a zero norm for the gradient when running an episode and I don’t really know what the problem is. Also, when I remove the following line:
rewards = (rewards - rewards.mean()) / (rewards.std() + eps)
It will stop learning eventually by having that gradient with zero norm. I’m not sure if I committed any obvious mistake here. Any help would be invaluable to me.