import argparse
import pickle
from collections import namedtuple
import matplotlib.pyplot as plt
import numpy as np
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
parser = argparse.ArgumentParser(description='Solve Pendulum-v0 with DQN')
parser.add_argument(
    '--gamma', type=float, default=0.9, metavar='G', help='discount factor (default: 0.9)')
parser.add_argument(
    '--num_actions', type=int, default=5, metavar='N', help='discretize action space (default: 5)')
parser.add_argument('--seed', type=int, default=0, metavar='N', help='random seed (default: 0)')
parser.add_argument('--render', action='store_true', help='render the environment')
parser.add_argument(
    '--log-interval',
    type=int,
    default=10,
    metavar='N',
    help='interval between training status logs (default: 10)')
args = parser.parse_args()
torch.manual_seed(args.seed)
np.random.seed(args.seed)
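
# Example invocation (the filename dqn.py is my assumption, not from the post):
#   python dqn.py --num_actions 5 --seed 0 --render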
TrainingRecord = namedtuple('TrainingRecord', ['ep', 'reward'])
Transition = namedtuple('Transition', ['s', 'a', 'r', 's_'])
class Opt():
    """Parameters of the two-layer net, updated by a hand-rolled NumPy Adam."""

    def __init__(self):
        self.params = {}
        self.params['W1'] = self._uniform_init(3, 100)
        self.params['b1'] = np.zeros(100)
        self.params['W2'] = self._uniform_init(100, 5)
        self.params['b2'] = np.zeros(5)

        # Gradient buffers; Agent.update() overwrites these before every train() call.
        self.grads = {}
        self.grads['W1'] = np.zeros((3, 100))
        self.grads['b1'] = np.zeros(100)
        self.grads['W2'] = np.zeros((100, 5))
        self.grads['b2'] = np.zeros(5)

        # Per-parameter Adam state (m, v, t), created lazily by _adam.
        self.optm_cfg = {}
        self.optm_cfg['W1'] = None
        self.optm_cfg['b1'] = None
        self.optm_cfg['W2'] = None
        self.optm_cfg['b2'] = None
    def train(self):
        self.params['W2'] = self._adam(self.params['W2'], self.grads['W2'], config=self.optm_cfg['W2'])[0]
        self.params['W1'] = self._adam(self.params['W1'], self.grads['W1'], config=self.optm_cfg['W1'])[0]
        self.params['b2'] = self._adam(self.params['b2'], self.grads['b2'], config=self.optm_cfg['b2'])[0]
        self.params['b1'] = self._adam(self.params['b1'], self.grads['b1'], config=self.optm_cfg['b1'])[0]
        # Update the configuration parameters to be used in the next iteration
        self.optm_cfg['W2'] = self._adam(self.params['W2'], self.grads['W2'], config=self.optm_cfg['W2'])[1]
        self.optm_cfg['W1'] = self._adam(self.params['W1'], self.grads['W1'], config=self.optm_cfg['W1'])[1]
        self.optm_cfg['b2'] = self._adam(self.params['b2'], self.grads['b2'], config=self.optm_cfg['b2'])[1]
        self.optm_cfg['b1'] = self._adam(self.params['b1'], self.grads['b1'], config=self.optm_cfg['b1'])[1]
    def _adam(self, x, dx, config=None):
        if config is None: config = {}
        config.setdefault('learning_rate', 1e-3)
        config.setdefault('beta1', 0.9)
        config.setdefault('beta2', 0.999)
        config.setdefault('epsilon', 1e-8)
        config.setdefault('m', np.zeros_like(x))
        config.setdefault('v', np.zeros_like(x))
        config.setdefault('t', 0)

        # Adam update formula
        config['t'] += 1
        config['m'] = config['beta1'] * config['m'] + (1 - config['beta1']) * dx
        config['v'] = config['beta2'] * config['v'] + (1 - config['beta2']) * (dx ** 2)
        bias_corr1 = 1 - config['beta1'] ** config['t']
        bias_corr2 = 1 - config['beta2'] ** config['t']
        denom = np.sqrt(config['v']) / (np.sqrt(bias_corr2) + config['epsilon'])
        step_size = config['learning_rate'] / bias_corr1
        next_x = x - step_size * (config['m'] / denom)

        # Patch up NaN entries (0/0 where an entry's gradient history is all
        # zeros) by keeping the old value.
        next_x = np.where(np.isnan(next_x), x, next_x)

        return next_x, config

    def _uniform_init(self, input_size, output_size):
        # Glorot/Xavier uniform initialization: U(-u, u) with u = sqrt(6 / (fan_in + fan_out))
        u = np.sqrt(6. / (input_size + output_size))
        return np.random.uniform(-u, u, (input_size, output_size))
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(3, 100)
        self.a_head = nn.Linear(100, args.num_actions)

    def forward(self, x):
        x = F.relu(self.fc(x))
        return self.a_head(x)
class Memory():

    data_pointer = 0
    isfull = False

    def __init__(self, capacity):
        self.memory = np.empty(capacity, dtype=object)
        self.capacity = capacity

    def update(self, transition):
        # Circular buffer: wrap the write pointer once capacity is reached.
        self.memory[self.data_pointer] = transition
        self.data_pointer += 1
        if self.data_pointer == self.capacity:
            self.data_pointer = 0
            self.isfull = True

    def sample(self, batch_size):
        # Uniform sampling with replacement.
        return np.random.choice(self.memory, batch_size)
class Agent():

    # Discretized torques; Pendulum-v0 clips actions to [-2, 2], so the
    # entries outside that range all act as maximum torque.
    action_list = [(-2,), (2,), (3,), (-3,), (4,)]
    max_grad_norm = 0.5

    def __init__(self):
        self.training_step = 0
        self.epsilon = 1
        self.eval_net, self.target_net = Net().float(), Net().float()
        self.memory = Memory(2000)
        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=1e-3)
        self.opt = Opt()

    def select_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        # Epsilon-greedy exploration over the discretized actions.
        if np.random.random() < self.epsilon:
            action_index = np.random.randint(args.num_actions)
        else:
            probs = self.eval_net(state)
            action_index = probs.max(1)[1].item()
        return self.action_list[action_index], action_index

    def save_param(self):
        torch.save(self.eval_net.state_dict(), 'param/dqn_net_params.pkl')

    def store_transition(self, transition):
        self.memory.update(transition)
    def update(self):
        self.training_step += 1

        transitions = self.memory.sample(32)
        s = torch.tensor([t.s for t in transitions], dtype=torch.float)
        a = torch.tensor([t.a for t in transitions], dtype=torch.long).view(-1, 1)
        r = torch.tensor([t.r for t in transitions], dtype=torch.float).view(-1, 1)
        s_ = torch.tensor([t.s_ for t in transitions], dtype=torch.float)

        # natural dqn
        # q_eval = self.eval_net(s).gather(1, a)
        # with torch.no_grad():
        #     q_target = r + args.gamma * self.target_net(s_).max(1, keepdim=True)[0]

        # double dqn
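        # Double DQN decouples selection from evaluation: the online net
        # picks the greedy next action,
        #     a* = argmax_a Q_eval(s', a)
        # and the target net scores it,
        #     y = r + gamma * Q_target(s', a*)
        # which reduces the overestimation bias of the natural DQN target above.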
        with torch.no_grad():
            a_ = self.eval_net(s_).max(1, keepdim=True)[1]
            q_target = r + args.gamma * self.target_net(s_).gather(1, a_)
        q_eval = self.eval_net(s).gather(1, a)

        self.optimizer.zero_grad()
        loss = F.smooth_l1_loss(q_eval, q_target)
        loss.backward()
        # nn.utils.clip_grad_norm_(self.eval_net.parameters(), self.max_grad_norm)
        # self.optimizer.step()
        # Manual update path: copy gradients out of the PyTorch net into the
        # NumPy buffers (nn.Linear stores weights as [out, in], hence the
        # transposes), run the hand-rolled Adam, and copy the results back.
        with torch.no_grad():
            self.opt.grads['b2'] = self.eval_net.a_head.bias.grad.numpy().copy()
            self.opt.grads['W2'] = self.eval_net.a_head.weight.grad.numpy().T.copy()
            self.opt.grads['b1'] = self.eval_net.fc.bias.grad.numpy().copy()
            self.opt.grads['W1'] = self.eval_net.fc.weight.grad.numpy().T.copy()

            self.opt.train()

            self.eval_net.a_head.bias.copy_(torch.from_numpy(self.opt.params['b2']))
            self.eval_net.a_head.weight.copy_(torch.from_numpy(self.opt.params['W2'].T))
            self.eval_net.fc.bias.copy_(torch.from_numpy(self.opt.params['b1']))
            self.eval_net.fc.weight.copy_(torch.from_numpy(self.opt.params['W1'].T))
        if self.training_step % 200 == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())

        self.epsilon = max(self.epsilon * 0.999, 0.01)

        return q_eval.mean().item()
def main():
    env = gym.make('Pendulum-v0')
    env.seed(args.seed)

    agent = Agent()

    training_records = []
    running_reward, running_q = -1000, 0
    for i_ep in range(100):
        score = 0
        state = env.reset()
        for t in range(200):  # Pendulum-v0 episodes are capped at 200 steps
            action, action_index = agent.select_action(state)
            state_, reward, done, _ = env.step(action)
            score += reward
            if args.render:
                env.render()
            # Pendulum-v0 rewards lie in roughly [-16.3, 0]; rescale toward [-1, 1]
            agent.store_transition(Transition(state, action_index, (reward + 8) / 8, state_))
            state = state_
            if agent.memory.isfull:
                q = agent.update()
                running_q = 0.99 * running_q + 0.01 * q

        running_reward = running_reward * 0.9 + score * 0.1
        training_records.append(TrainingRecord(i_ep, running_reward))

        if i_ep % args.log_interval == 0:
            print('Ep {}\tAverage score: {:.2f}\tAverage Q: {:.2f}'.format(
                i_ep, running_reward, running_q))

        if running_reward > -200:
            print("Solved! Running reward is now {}!".format(running_reward))
            env.close()
            agent.save_param()
            with open('log/dqn_training_records.pkl', 'wb') as f:
                pickle.dump(training_records, f)
            break

    env.close()

    plt.plot([r.ep for r in training_records], [r.reward for r in training_records])
    plt.title('DQN')
    plt.xlabel('Episode')
    plt.ylabel('Moving averaged episode reward')
    plt.savefig("img/dqn.png")
    plt.show()
if __name__ == '__main__':
    main()
I've implemented Adam by hand (the Opt class above), but it doesn't work and I can't figure out why. If I uncomment self.optimizer.step() instead, it works. I transcribed the update from https://pytorch.org/docs/stable/_modules/torch/optim/adam.html#Adam.step
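
Two places where _adam diverges from that reference look suspicious. First, the reference computes the denominator as sqrt(v) / sqrt(bias_corr2) + eps, adding epsilon to the whole ratio, while _adam has sqrt(v) / (sqrt(bias_corr2) + eps). With epsilon inside the parentheses, any entry whose gradient history is all zeros has m = v = 0, so the denominator is exactly 0 and the update is 0/0 = NaN, which is what the isnan patch-up in _adam is masking. Second, Opt.train() calls _adam twice per parameter (once for the value, once for the config); since the config dict is mutated in place and each call increments t and folds the gradient into m and v again, the optimizer state advances twice per training step. Below is a minimal sketch of a single step using the reference denominator; the standalone function name adam_step is mine, not from the post:

import numpy as np

def adam_step(x, dx, cfg):
    # One Adam step on a NumPy array, following the PyTorch reference.
    cfg['t'] += 1
    cfg['m'] = cfg['beta1'] * cfg['m'] + (1 - cfg['beta1']) * dx
    cfg['v'] = cfg['beta2'] * cfg['v'] + (1 - cfg['beta2']) * dx ** 2
    bias_corr1 = 1 - cfg['beta1'] ** cfg['t']
    bias_corr2 = 1 - cfg['beta2'] ** cfg['t']
    # epsilon is added outside the sqrt ratio, so the denominator is
    # always at least epsilon and 0/0 cannot occur
    denom = np.sqrt(cfg['v']) / np.sqrt(bias_corr2) + cfg['epsilon']
    return x - (cfg['learning_rate'] / bias_corr1) * cfg['m'] / denom

With that denominator the NaN guard should be unnecessary, and a single call per parameter that unpacks both return values, e.g. self.params['W1'], self.optm_cfg['W1'] = self._adam(self.params['W1'], self.grads['W1'], config=self.optm_cfg['W1']), would keep t, m and v advancing once per gradient.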