Hello,
I’m not very experienced with neural networks; this is my first project in the field.
I’ve followed the official PyTorch DQN tutorial to create a DQN. It doesn’t play CartPole, though, but rather a very simplified version of a certain card game, where the agent gets a hand of random cards and has to consistently pick the largest one in order to win.
The model’s learning ability (or lack thereof) aside, it runs painfully slowly, a few orders of magnitude slower than an equivalent model in Keras.
I’m not sure what to make of that. cProfile showed that DQAgent.make_optimization_step accounts for the most cumulative time. As a quick test I replaced all the tensor operations in it with random values, but even then it only sped up by about 5x, which is still far too slow.
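The snippet below is abridged, so for context this is (roughly) the module-level setup it relies on; Transition and device are straight from the tutorial, while the logger and the two memory-size constants are defined elsewhere and left out here:

import math
import random
from collections import namedtuple
from itertools import count

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# log, DEFAULT_MEMORY_CAPACITY and transitionEstimatedBytes are defined elsewhere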
class DQN(nn.Module):
    def __init__(self, input_length, output_length, hl1_length=256, hl2_length=256):
        """
        The current state is the input and action probabilities are the outputs.
        """
        super(DQN, self).__init__()
        # To check the device being used
        self.__dummy_param = nn.Parameter(torch.empty(0))
        self.hl1 = nn.Linear(input_length, hl1_length)
        self.hl2 = nn.Linear(hl1_length, hl2_length)
        self.outputLayer = nn.Linear(hl2_length, output_length)
        self.to(device)
        log.trace('DQN #{} | Neural network initialized: {}'.format(id(self), self))

    def forward(self, x):
        x = F.relu(self.hl1(x))
        x = F.relu(self.hl2(x))
        x = torch.sigmoid(self.outputLayer(x))
        return x.view(x.size(0), -1)

    def get_device(self):
        return self.__dummy_param.device

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0
        log.trace('ReplayMemory #{} | Initialized with capacity {}'.format(id(self), capacity))

    @staticmethod
    def calculate_capacity(availableBytes):
        N_safe = int(availableBytes // transitionEstimatedBytes)
        return N_safe

    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

class DQAgent:
    def __init__(self, env, hl1_length=256, hl2_length=256,
                 gamma=0.999, eps_start=0.9, eps_end=0.05, eps_decay_rate=200,
                 memory_length=None, memory_bytes=None, batch_size=128,
                 episodes_between_updates=10):
        self.env = env
        self._state_length = env.getStateLength()
        self._n_actions = env.getActionSpaceLength()
        self.gamma = gamma
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay_rate = eps_decay_rate
        self.episodes_between_updates = episodes_between_updates
        self._steps = 0
        # Memory can be set by giving either a number of Transitions stored or
        # the number of bytes available.
        memA = memory_length or 0
        memB = ReplayMemory.calculate_capacity(memory_bytes or 0)
        memory_capacity = (min(memA, memB) if (memA and memB) else (memA or memB)) or DEFAULT_MEMORY_CAPACITY
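        # i.e. whichever limit was given is used; if both were given, the stricter
        # (smaller) of the two wins, and if neither was given it falls back to
        # DEFAULT_MEMORY_CAPACITY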
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size
        # Two neural networks are used - the target network only gets updated
        # every so often rather than after every single move.
        self.policy_nn = DQN(self._state_length, self._n_actions, hl1_length, hl2_length)
        self.target_nn = DQN(self._state_length, self._n_actions, hl1_length, hl2_length)
        self.target_nn.load_state_dict(self.policy_nn.state_dict())
        self.target_nn.eval()
        self.optimizer = optim.RMSprop(self.policy_nn.parameters())
        # Store all episode rewards to later see how well the model was doing
        self.episode_rewards_history = []
        log.debug('DQAgent created (#{})'.format(id(self)))

    def _get_epsilon_threshold(self):
        decay_factor = math.exp(-1.0 * self._steps / self.eps_decay_rate)
        eps_threshold = self.eps_end + decay_factor * (self.eps_start - self.eps_end)
        return eps_threshold
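
    # Sanity check of the decay above with the default parameters (eps_start=0.9,
    # eps_end=0.05, eps_decay_rate=200): step 0 -> 0.90, step 200 -> ~0.36,
    # step 1000 -> ~0.056, so exploration fades out over roughly a thousand steps.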
    def get_device(self):
        return self.policy_nn.get_device()

    def make_optimization_step(self):
        if len(self.memory) < self.batch_size:
            return
        transitions_sample = self.memory.sample(self.batch_size)
        # Transpose a list of transitions into a transition of lists
        batch = Transition(*zip(*transitions_sample))
        non_final_states_mask = torch.tensor([s is not None for s in batch.next_state], device=self.get_device(), dtype=torch.bool)
        non_final_next_states = torch.cat([s.unsqueeze(0) for s in batch.next_state if s is not None])
        state_batch = torch.cat([s.unsqueeze(0) for s in batch.state])
        action_batch = torch.tensor(batch.action, device=self.get_device())
        reward_batch = torch.tensor(batch.reward, device=self.get_device())
        # Q-values are state-action values
        action_values = self.policy_nn(state_batch)
        q_values = []
        for q, actionIdx in zip(action_values, action_batch):
            actionIdx = int(actionIdx)
            q_values.append(q[actionIdx])
        q_values = torch.tensor(q_values, device=self.get_device(), requires_grad=True)
        next_state_values = torch.zeros(self.batch_size, device=self.get_device())
        next_state_values[non_final_states_mask] = self.target_nn(non_final_next_states).max(1)[0].detach()
        expected_q_values = (next_state_values * self.gamma) + reward_batch
        loss = F.smooth_l1_loss(q_values, expected_q_values)
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_nn.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        # Memory cleanup
        del non_final_states_mask
        del non_final_next_states
        del state_batch
        del action_batch
        del reward_batch
        del q_values
        del next_state_values
        del expected_q_values
        del loss
        torch.cuda.empty_cache()

    def select_action(self, state):
        eps_threshold = self._get_epsilon_threshold()
        self._steps += 1
        if random.random() > eps_threshold:
            # Choose the best action as per the model
            with torch.no_grad():
                action_vals = self.policy_nn(state)
                best_action_idx = action_vals.max(0)[1].view(1)
                return best_action_idx
        else:
            # Choose at random
            random_action_idx = torch.tensor([random.randrange(self._n_actions)], device=self.get_device(), dtype=torch.float32)
            return random_action_idx
    def train(self, num_episodes, pbar=None):
        log.debug('DQAgent #{} | Training for {} episodes...'.format(id(self), num_episodes))
        for i_episode in range(num_episodes):
            self.env.reset()
            state = torch.tensor(self.env.state.toList(), device=self.get_device(), dtype=torch.float32)
            # Each entry in episode_rewards_history is the sum of rewards over all
            # transitions within a single episode.
            episode_reward_sum = 0.0
            for t in count():
                # Select & perform an action
                action = self.select_action(state)
                new_state, reward, done, info = self.env.step(int(action[0]))
                new_state = torch.tensor(new_state, device=self.get_device(), dtype=torch.float32)
                reward = torch.tensor([reward], device=self.get_device())
                # Count the reward toward keeping track of history
                episode_reward_sum += reward[0]
                # Store the transition in memory
                self.memory.push(state, action, new_state, reward)
                # Move to the next state
                state = new_state
                # Perform one optimization step on policy network
                self.make_optimization_step()
                # Check if the episode has ended
                if done:
                    break
            # Every N episodes, update the target network
            if i_episode % self.episodes_between_updates == 0:
                self.target_nn.load_state_dict(self.policy_nn.state_dict())
            # If there is a progress bar, update it
            if pbar is not None:
                pbar.update(1)
            # Remember the cumulative reward received for this episode
            self.episode_rewards_history.append(episode_reward_sum)
        # Cleanup
        self.env.close()
        if pbar is not None:
            pbar.close()
        # Flatten the episode_rewards_history into one tensor
        self.episode_rewards_history = torch.tensor(self.episode_rewards_history, device=self.get_device())
        log.debug('DQAgent #{} | Training complete'.format(id(self)))
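For completeness, the way I drive it is essentially this (the env class name and the numbers are placeholders here rather than my exact values):

import cProfile
from tqdm import tqdm

env = CardGameEnv()  # my environment wrapper; the real class name differs
agent = DQAgent(env, memory_length=10000, batch_size=128)

with tqdm(total=500) as pbar:
    agent.train(500, pbar=pbar)

# The profiling mentioned above was run over the same kind of call, roughly:
# cProfile.run('agent.train(500)', sort='cumtime')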