Hi,
I am trying to develop A3C reinforcement learning in pytorch.
But I am getting same action being taken on multinomial as well as categorical sampling of actions.
Below is my code…I need help in understanding whether simple A3C works well in continuous state space as well.
Thanks,
Granth
while count<max_timesteps-1:
episode_length += 1
if done:
cx = Variable(torch.zeros(1, params.state_dim))
hx = Variable(torch.zeros(1, params.state_dim))
else:
cx = Variable(cx.data)
hx = Variable(hx.data)
values = []
log_probs = []
rewards = []
entropies = []
while count<max_timesteps-1:
value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
prob = F.softmax(action_values,dim = -1)
log_prob = F.log_softmax(action_values, dim=-1)
entropy = -(log_prob * prob).sum(1, keepdim=True)
entropies.append(entropy)
cdist = categorical.Categorical(prob)
action = cdist.sample()
log_prob = log_prob[0, Variable(action)].data
state, reward, done = env.step(action)
done = (done or count == max_timesteps-2)
reward = max(min(reward, 1), -1)
count +=1
if done:
episode_length = 0
state = env.reset()
values.append(value)
log_probs.append(log_prob)
rewards.append(reward)
print(ticker," action:",action, "reward ",reward)
if done:
break
R = torch.zeros(1, 1)
if not done:
value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
R = value.data
values.append(Variable(R))
policy_loss = 0
value_loss = 0
R = Variable(R)
gae = torch.zeros(1, 1)
for i in reversed(range(len(rewards))):
R = params.gamma * R + rewards[i]
advantage = R - values[i]
value_loss = value_loss + 0.5 * advantage.pow(2)
TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
gae = gae * params.gamma * params.tau + TD
policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]
optimizer.zero_grad()
(policy_loss + 0.5 * value_loss).backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
optimizer.step()