Hey there,
I trained a neural network (NN) using PPO. My network gives me the action I should take for a given state and the estimated value for that state and action. I trained the network with normalized rewards:
rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
Questions:
-
In practice (when using the NN) I only get the normalized estimated value. Is there any way to get the true estimated value? I do not have rewards.mean() etc. at that point, so I cannot simply calculate it as explained here.
-
What does my value function actually estimate? Does it simply rate the current state and action, or does it evaluate the given state and give me a hint about the final score?
-
What I actually want is a network that predicts the final score for a given state. Can I use the value function to achieve this, or should I use something different, e.g. train a separate NN?
My code is at:
A snippet of my network is:
#Actor Model:
class ActorModel(nn.Module):
    """Actor-critic network with a shared trunk and two heads.

    The trunk is two ``Linear`` + ``PReLU`` layers. The actor head receives
    the trunk output concatenated with the slice
    ``input[..., 3 * action_dim : 4 * action_dim]`` of the raw input and
    produces a softmax distribution over ``action_dim`` actions. The critic
    head maps the trunk output to a single value.

    Args:
        state_dim: Size of the last dimension of the input state.
        action_dim: Number of discrete actions; also the width of the input
            slice that is fed directly to the actor head.
        n_latent_var: Width of the hidden layers.
    """

    def __init__(self, state_dim, action_dim, n_latent_var):
        super().__init__()
        self.a_dim = action_dim

        # Shared trunk (used by both heads).
        self.ac = nn.Linear(state_dim, n_latent_var)
        self.ac_prelu = nn.PReLU()
        self.ac1 = nn.Linear(n_latent_var, n_latent_var)
        self.ac1_prelu = nn.PReLU()

        # Actor head: trunk features plus an action_dim-wide slice of the
        # raw input (see forward).
        self.a1 = nn.Linear(n_latent_var + action_dim, action_dim)

        # Critic head.
        self.c1 = nn.Linear(n_latent_var, n_latent_var)
        self.c1_prelu = nn.PReLU()
        self.c2 = nn.Linear(n_latent_var, 1)

    def forward(self, input):
        """Compute action probabilities and the critic value.

        Args:
            input: State tensor whose last dimension is the state vector.
                Any number of leading (batch) dimensions is accepted,
                including none.

        Returns:
            Tuple ``(actor_out, critic)``: ``actor_out`` has shape
            ``(*leading, action_dim)`` and sums to 1 along the last
            dimension; ``critic`` has shape ``(*leading, 1)``.
        """
        # Author's example layout (4 players, 15 cards each):
        #   on_table(60) + on_hand(60) + played(60) + play_options(60)
        #   + add_states(15)   -> 255 values in total
        #   add_states = color free (4) + would win (1) = 5 per player
        # NOTE(review): with that layout the slice below is the
        # play_options segment -- confirm against the environment.

        # Shared trunk.
        hidden = self.ac_prelu(self.ac(input))
        hidden = self.ac1_prelu(self.ac1(hidden))

        # Actor head. Slicing with ``...`` and concatenating on the last
        # dimension handles unbatched and batched input with one code path.
        options = input[..., self.a_dim * 3:self.a_dim * 4]
        actor_out = self.a1(torch.cat([hidden, options], dim=-1))
        actor_out = actor_out.softmax(dim=-1)

        # Critic head.
        critic = self.c1_prelu(self.c1(hidden))
        critic = self.c2(critic)
        return actor_out, critic
class ActorCritic(nn.Module):
    """Policy wrapper around ``ActorModel`` for sampling and evaluation.

    Args:
        state_dim: Size of the state vector passed to the network.
        action_dim: Number of discrete actions.
        n_latent_var: Hidden-layer width passed to the network.
    """

    def __init__(self, state_dim, action_dim, n_latent_var):
        super().__init__()
        self.a_dim = action_dim
        # Combined actor/critic network.
        self.actor_critic = ActorModel(state_dim, action_dim, n_latent_var)

    def act(self, state, memory):
        """Sample one action for ``state`` and optionally record it.

        Args:
            state: State as a NumPy array (converted to a float32 tensor)
                or as a tensor.
            memory: Object with ``states``, ``actions`` and ``logprobs``
                lists to append to, or ``None`` to record nothing.

        Returns:
            The sampled action index as a Python ``int``.
        """
        if isinstance(state, np.ndarray):
            state = torch.from_numpy(state).float()

        # Only plain numbers leave this method, so no autograd graph is
        # needed while sampling.
        with torch.no_grad():
            action_probs, _ = self.actor_critic(state)
            # NOTE: the probabilities are not masked here, so every action
            # with non-zero probability can be drawn.
            dist = Categorical(action_probs)
            # Draws an index at random, weighted by ``action_probs``.
            action = dist.sample()

            if memory is not None:
                # Store NumPy / Python values rather than tensors; the
                # original author notes deepcopy of the memory otherwise
                # fails.
                memory.states.append(state.detach().cpu().numpy())
                memory.actions.append(int(action.item()))
                memory.logprobs.append(float(dist.log_prob(action).item()))

        return action.item()

    def evaluate(self, state, action):
        """Score given actions under the current policy.

        Args:
            state: Tensor of states passed to the network.
            action: Tensor of action indices to evaluate.

        Returns:
            Tuple ``(action_logprobs, state_value, dist_entropy)``: the
            log-probability of each action, the critic output with all
            size-1 dimensions removed, and the entropy of the action
            distribution.
        """
        action_probs, state_value = self.actor_critic(state)
        dist = Categorical(action_probs)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        # torch.squeeze drops every size-1 dimension, so a batch of one
        # comes back as a 0-dim tensor.
        return action_logprobs, torch.squeeze(state_value), dist_entropy