Reinforcement learning DQN *super* slow

Hello,

I’m not very experienced with neural networks; this is my first project in this field.

I’ve followed the official PyTorch tutorial to create a DQN. Instead of CartPole, though, it plays a very simplified version of a certain card game: the agent gets a hand of random cards and has to consistently pick the largest number in order to win.
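
The environment itself is nothing special; stripped down, it exposes roughly this interface (placeholder name, game logic omitted):

```python
# Placeholder name; the real class contains the card-game logic
class CardGameEnv:
    def getStateLength(self) -> int:
        ...  # length of the flat state vector

    def getActionSpaceLength(self) -> int:
        ...  # number of cards/actions to choose from

    def reset(self):
        ...  # deals a new random hand; also sets self.state (which has .toList())

    def step(self, action_idx):
        ...  # returns (new_state, reward, done, info), gym-style

    def close(self):
        ...
```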

The model’s learning ability (or lack thereof) aside, it runs painfully slowly: a few orders of magnitude slower than the same model in Keras.

I’m not sure what to make of that. cProfile showed that the DQAgent.make_optimization_step function had the most cumulative time. As a quick test I replaced all the tensor operations in it with random values, but even then it only sped up by about 5 times, which was still way too slow.
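
In case it matters, the profiling was done roughly like this (simplified; 100 episodes is just an example):

```python
import cProfile
import pstats

profiler = cProfile.Profile()
profiler.enable()
agent.train(num_episodes=100)   # agent is a DQAgent, see below
profiler.disable()

# Show the 20 entries with the largest cumulative time
pstats.Stats(profiler).sort_stats('cumulative').print_stats(20)
```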

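The snippets below leave out my module header; they assume roughly this setup (the concrete values are placeholders, the real ones live elsewhere in the project):

```python
import math
import random
from collections import namedtuple
from itertools import count

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Same Transition as in the tutorial
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

# Placeholder values; `log` is my project's logger (it has .trace() and .debug())
DEFAULT_MEMORY_CAPACITY = 10000
transitionEstimatedBytes = 512
```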

```python
class DQN(nn.Module):
	def __init__(self, input_length, output_length, hl1_length=256, hl2_length=256):
		"""
		The current state is the input and action probabilities are the outputs.
		"""
		super(DQN, self).__init__()

		# To check the device being used
		self.__dummy_param = nn.Parameter(torch.empty(0))

		self.hl1 = nn.Linear(input_length, hl1_length)
		self.hl2 = nn.Linear(hl1_length, hl2_length)
		self.outputLayer = nn.Linear(hl2_length, output_length)

		self.to(device)

		log.trace('DQN #{} | Neural network initialized: {}'.format(id(self), self))

	def forward(self, x):
		x = F.relu(self.hl1(x))
		x = F.relu(self.hl2(x))
		x = torch.sigmoid(self.outputLayer(x))

		return x.view(x.size(0), -1)

	def get_device(self):
		return self.__dummy_param.device
```
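
Just to show how the network behaves shape-wise (arbitrary sizes):

```python
net = DQN(input_length=10, output_length=4)
single_state = torch.rand(10, device=net.get_device())
state_batch = torch.rand(32, 10, device=net.get_device())

print(net(single_state).shape)   # torch.Size([4, 1]) -- single 1-D state
print(net(state_batch).shape)    # torch.Size([32, 4]) -- batch of states
```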

```python
class ReplayMemory:
	def __init__(self, capacity):
		self.capacity = capacity
		self.memory = []
		self.position = 0

		log.trace('ReplayMemory #{} | Initialized with capacity {}'.format(id(self), capacity))

	@staticmethod
	def calculate_capacity(availableBytes):
		N_safe = int(availableBytes // transitionEstimatedBytes)
		return N_safe

	def push(self, *args):
		if len(self.memory) < self.capacity:
			self.memory.append(None)

		self.memory[self.position] = Transition(*args)
		self.position = (self.position + 1) % self.capacity

	def sample(self, batch_size):
		return random.sample(self.memory, batch_size)

	def __len__(self):
		return len(self.memory)
```

```python
class DQAgent:
	def __init__(self, env, hl1_length=256, hl2_length=256,
					gamma=0.999, eps_start=0.9, eps_end=0.05, eps_decay_rate=200,
					memory_length=None, memory_bytes=None, batch_size=128,
					episodes_between_updates=10):

		self.env = env
		self._state_length = env.getStateLength()
		self._n_actions = env.getActionSpaceLength()

		self.gamma = gamma
		self.eps_start = eps_start
		self.eps_end = eps_end
		self.eps_decay_rate = eps_decay_rate
		self.episodes_between_updates = episodes_between_updates

		self._steps = 0

		# Memory can be set by giving either a number of Transitions stored or
		#  the number of bytes available.
		memA = memory_length or 0
		memB = ReplayMemory.calculate_capacity(memory_bytes or 0)
		memory_capacity = (min(memA, memB) if (memA and memB) else (memA or memB)) or DEFAULT_MEMORY_CAPACITY
		self.memory = ReplayMemory(memory_capacity)
		self.batch_size = batch_size

		# Two neural networks are used - the target network only gets updated
		#  every so often rather than after every single move.
		self.policy_nn = DQN(self._state_length, self._n_actions, hl1_length, hl2_length)
		self.target_nn = DQN(self._state_length, self._n_actions, hl1_length, hl2_length)
		self.target_nn.load_state_dict(self.policy_nn.state_dict())
		self.target_nn.eval()

		self.optimizer = optim.RMSprop(self.policy_nn.parameters())

		# Store all episode rewards to later see how well the model was doing
		self.episode_rewards_history = []

		log.debug('DQAgent created (#{})'.format(id(self)))

	def _get_epsilon_threshold(self):
		decay_factor = math.exp(-1.0 * self._steps / self.eps_decay_rate)
		eps_threshold = self.eps_end + decay_factor * (self.eps_start - self.eps_end)
		return eps_threshold

	def get_device(self):
		return self.policy_nn.get_device()

	def make_optimization_step(self):
		if len(self.memory) < self.batch_size:
			return

		transitions_sample = self.memory.sample(self.batch_size)

		# Transpose a list of transitions into a transition of lists
		batch = Transition(*zip(*transitions_sample))

		non_final_states_mask = torch.tensor([s is not None for s in batch.next_state], device=self.get_device(), dtype=torch.bool)
		non_final_next_states = torch.cat([s.unsqueeze(0) for s in batch.next_state if s is not None])
		state_batch = torch.cat([s.unsqueeze(0) for s in batch.state])
		action_batch = torch.tensor(batch.action, device=self.get_device())
		reward_batch = torch.tensor(batch.reward, device=self.get_device())

		# Q-values are state-action values
		action_values = self.policy_nn(state_batch)
		q_values = []
		for q, actionIdx in zip(action_values, action_batch):
			actionIdx = int(actionIdx)
			q_values.append(q[actionIdx])
		q_values = torch.tensor(q_values, device=self.get_device(), requires_grad=True)

		next_state_values = torch.zeros(self.batch_size, device=self.get_device())
		next_state_values[non_final_states_mask] = self.target_nn(non_final_next_states).max(1)[0].detach()
		expected_q_values = (next_state_values * self.gamma) + reward_batch

		loss = F.smooth_l1_loss(q_values, expected_q_values)

		# Optimize the model
		self.optimizer.zero_grad()
		loss.backward()
		for param in self.policy_nn.parameters():
			if param.grad is not None:
				param.grad.data.clamp_(-1, 1)
		self.optimizer.step()

		# Memory cleanup
		del non_final_states_mask
		del non_final_next_states
		del state_batch
		del action_batch
		del reward_batch
		del q_values
		del next_state_values
		del expected_q_values
		del loss
		torch.cuda.empty_cache()

	def select_action(self, state):
		eps_threshold = self._get_epsilon_threshold()
		self._steps += 1

		if random.random() > eps_threshold:
			# Choose the best action as per the model
			with torch.no_grad():
				action_vals = self.policy_nn(state)
				best_action_idx = action_vals.max(0)[1].view(1)
				return best_action_idx
		else:
			# Choose at random
			random_action_idx = torch.tensor([random.randrange(self._n_actions)], device=self.get_device(), dtype=torch.float32)
			return random_action_idx

	def train(self, num_episodes, pbar=None):
		log.debug('DQAgent #{} | Training for {} episodes...'.format(id(self), num_episodes))
		for i_episode in range(num_episodes):
			self.env.reset()
			state = torch.tensor(self.env.state.toList(), device=self.get_device(), dtype=torch.float32)

			# Each entry in episode_rewards_history is the sum of the rewards
			#  of all transitions within a single episode.
			episode_reward_sum = 0.0

			for t in count():
				# Select & perform an action
				action = self.select_action(state)
				new_state, reward, done, info = self.env.step(int(action[0]))
				new_state = torch.tensor(new_state, device=self.get_device(), dtype=torch.float32)
				reward = torch.tensor([ reward ], device=self.get_device())

				# Count the reward toward keeping track of history
				episode_reward_sum += reward[0]

				# Store the transition in memory
				self.memory.push(state, action, new_state, reward)

				# Move to the next state
				state = new_state

				# Perform one optimization step on policy network
				self.make_optimization_step()

				# Check if the episode has ended
				if done:
					break

			# Every N episodes, update the target network
			if i_episode % self.episodes_between_updates == 0:
				self.target_nn.load_state_dict(self.policy_nn.state_dict())

			# If there is a progress bar, update it
			if pbar is not None:
				pbar.update(1)

			# Remember the cumulative reward received for this episode
			self.episode_rewards_history.append(episode_reward_sum)

		# Cleanup
		self.env.close()
		if pbar is not None:
			pbar.close()

		# Flatten the episode_rewards_history into one tensor
		self.episode_rewards_history = torch.tensor(self.episode_rewards_history, device=self.get_device())

		log.debug('DQAgent #{} | Training complete'.format(id(self)))
```
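
For reference, the official tutorial computes the chosen-action Q-values with gather() instead of a Python loop; adapted to my variable names it would look roughly like this (untested in my code, and it assumes action_batch is a LongTensor of shape (batch_size, 1)):

```python
# Sketch of the tutorial-style version of the Q-value computation
state_action_values = self.policy_nn(state_batch).gather(1, action_batch)

next_state_values = torch.zeros(self.batch_size, device=self.get_device())
next_state_values[non_final_states_mask] = \
    self.target_nn(non_final_next_states).max(1)[0].detach()
expected_q_values = (next_state_values * self.gamma) + reward_batch

loss = F.smooth_l1_loss(state_action_values, expected_q_values.unsqueeze(1))
```

Would switching to something like that plausibly account for the gap, or is the slowdown more likely coming from somewhere else?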