Hi, I’m developing an agent for a reinforcement learning problem. It uses a custom Gym environment with a continuous state space and a continuous action space. The action produced by the network is a vector [u, v], which is a command for the bot to perform. Since the input to the network consists of coordinates (which can be negative), I couldn’t use ReLU as the activation function, so I used LeakyReLU with a negative slope of 1 (which effectively makes it a linear function).
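As a quick sanity check (separate from the training code below), this small snippet just confirms my assumption that leaky_relu with negative_slope=1 is the identity, so the layers it wraps remain purely linear:

import torch
import torch.nn.functional as F

# With negative_slope=1 the negative half is scaled by 1, so the activation
# returns its input unchanged, i.e. it behaves as the identity function.
x = torch.randn(4, 6)
print(torch.allclose(F.leaky_relu(x, negative_slope=1.0), x))  # prints: True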
Here is my model:
import torch
import torch.nn as nn
import torch.nn.functional as F

# `device` is defined globally elsewhere in the script.
class DQN(nn.Module):
    def __init__(self, in_features=6, hidden_layers=[15, 15, 15, 15, 15, 10, 10, 8, 4], outputs=2):
        super().__init__()
        self.first_layer = nn.Linear(in_features, hidden_layers[0])
        self.hidden1 = nn.Linear(hidden_layers[0], hidden_layers[1])
        self.hidden2 = nn.Linear(hidden_layers[1], hidden_layers[2])
        self.hidden3 = nn.Linear(hidden_layers[2], hidden_layers[3])
        self.hidden4 = nn.Linear(hidden_layers[3], hidden_layers[4])
        self.hidden5 = nn.Linear(hidden_layers[4], hidden_layers[5])
        self.hidden6 = nn.Linear(hidden_layers[5], hidden_layers[6])
        self.hidden7 = nn.Linear(hidden_layers[6], hidden_layers[7])
        self.hidden8 = nn.Linear(hidden_layers[7], hidden_layers[8])
        self.output = nn.Linear(hidden_layers[8], outputs)

    def forward(self, x):
        x = x.to(device)
        # negative_slope=1 makes leaky_relu the identity, so these layers stay linear
        x = F.leaky_relu(self.first_layer(x), negative_slope=1)
        x = F.leaky_relu(self.hidden1(x), negative_slope=1)
        x = F.leaky_relu(self.hidden2(x), negative_slope=1)
        x = torch.sigmoid(self.hidden3(x))
        x = torch.sigmoid(self.hidden4(x))
        x = torch.sigmoid(self.hidden5(x))
        x = torch.sigmoid(self.hidden6(x))
        x = torch.sigmoid(self.hidden7(x))
        x = F.leaky_relu(self.hidden8(x), negative_slope=1)
        x = F.leaky_relu(self.output(x), negative_slope=1)
        return x
And this is my optimization function:
def optimize_model():
    print(len(memory))
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # Mask of transitions whose next state is not terminal
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                       if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Duplicate the reward column so it matches the two-dimensional network output
    reshaped_reward_batch = torch.cat(
        (
            reward_batch.reshape(reward_batch.shape[0], 1),
            reward_batch.reshape(reward_batch.shape[0], 1)
        ), 1
    )

    state_action_values = policy_net(state_batch)

    # Bootstrap the targets from the target network; terminal states stay at zero
    next_state_values = torch.zeros((BATCH_SIZE, 2), device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).detach()
    expected_state_action_values = (next_state_values * GAMMA) + reshaped_reward_batch

    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values)
    print('Loss', loss.item())

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
And this is my training loop:
for i_episode in range(num_episodes):
    state = env.reset()
    state = torch.tensor([state], device=device, dtype=torch.float)
    for t in count():
        action = select_action(state)
        action_cpu = action.cpu()
        state, reward, done, info = env.step(action_cpu[0])
        state = torch.tensor([state], device=device, dtype=torch.float)
        reward = torch.tensor([reward], device=device, dtype=torch.float)
        if not done:
            next_state = state
        else:
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)
        state = next_state

        min_distance = min(info["distance"], min_distance)
        print(f'Episode: {i_episode}, Iteration: {t}, Reward: {reward[0]}')
        print(f'Action: {action_cpu[0]}')
        print(f'Min Distance from target: {min_distance}\n')

        optimize_model()
        steps += 1
        if done:
            episode_durations.append(t + 1)
            break

        # Update the target network, copying all weights and biases in DQN
        if steps % STEPS_TO_UPDATE_TARGET_NET == 0:
            target_net.load_state_dict(policy_net.state_dict())
The problem is that the loss in this network keeps growing instead of decreasing. Any ideas about what might be causing this, and how to fix it?