# How can I enhance the entropy term in A3C?

I’m trying to use this A3C code in my custom environment, and I have a basic understanding of the algorithm. The algorithm runs, but it does not give me good performance. I looked into multiple implementations, and each one seemed different to me — like this one, for example. The algorithm I wrote is as follows:

• A3C
``````class ActorCritics(nn.Module):
def __init__(self,input,n_actions,env,gamma=0.99):
super(ActorCritics,self).__init__()
self.gamma=gamma
self.env=env
self.n_actions=n_actions

self.pi1=nn.Linear(input,128)
self.v1=nn.Linear(input,128)

self.pi2=nn.Linear(128,64)
self.v2=nn.Linear(128,64)

self.pi3=nn.Linear(64,32)
self.v3=nn.Linear(64,32)

self.pi4=nn.Linear(32,16)
self.v4=nn.Linear(32,16)

self.pi5=nn.Linear(16,8)
self.v5=nn.Linear(16,8)

self.pi6=nn.Linear(8,4)
self.v6=nn.Linear(8,4)

self.pi7=nn.Linear(4,2)
self.v7=nn.Linear(4,2)

self.pi=nn.Linear(2,n_actions)
self.v=nn.Linear(2,1)

self.rewards=[]
self.actions=[]
self.states=[]

#this function takes the values of the state,actions,and reward and append to the memory
def remember(self,state,action,reward):
self.actions.append(action)
self.rewards.append(reward)
self.states.append(state)

#this function reset the memory each time we are calling the learning function
def clear_memory(self):
self.states=[]
self.actions=[]
self.rewards=[]

def forward(self,state):
pi1=F.relu(self.pi1(state))
v1=F.relu(self.v1(state))

pi2=F.relu(self.pi2(pi1))
v2=F.relu(self.v2(v1))

pi3=F.relu(self.pi3(pi2))
v3=F.relu(self.v3(v2))

pi4=F.relu(self.pi4(pi3))
v4=F.relu(self.v4(v3))

pi5=F.relu(self.pi5(pi4))
v5=F.relu(self.v5(v4))

pi6=F.relu(self.pi6(pi5))
v6=F.relu(self.v6(v5))

pi7=F.relu(self.pi7(pi6))
v7=F.relu(self.v7(v6))

pi=self.pi(pi7)
v=self.v(v7)
return pi,v

def calc_returns(self,done,vstates):
p,v=self.forward(vstates)

R=v[-1]*(1-int(done))
batch_return=[]

for reward in self.rewards[::-1]:
R=reward+self.gamma*R
batch_return.append(R)
batch_return.reverse()
batch_return=T.tensor(batch_return,dtype=float)
return batch_return

def calc_loss(self,done):
#states=T.tensor(self.states,dtype=T.float)
list_state=[]
if len(self.states)>1:
for lstate in self.states:
soruce,end=self.env.state_dec(lstate)
state_v=self.env.state_to_vector(soruce,end)
list_state.append(state_v)
states=T.tensor(list_state)

else:
soruce,end=self.env.state_dec(self.states[0])
state_v=self.env.state_to_vector(soruce,end)
list_state.append(state_v)
states=T.tensor([list_state])
actions=T.tensor(self.actions,dtype=T.float)

returns=self.calc_returns(done,states)

p,values=self.forward(states)
values=values.squeeze()

critic_loss=(returns-values)**2

probs=T.softmax(p,dim=1)
dist=Categorical(probs)
log_probs=dist.log_prob(actions)
actor_loss=-log_probs*(returns-values)
total_loss=(critic_loss+actor_loss).mean()

def choose_action(self,node,action):
state_vector=self.env.state_to_vector(node,action)
state=T.tensor([state_vector],dtype=T.float)
pi,v=self.forward(state)
probs=T.softmax(pi,dim=1)
dist=Categorical(probs)
action=dist.sample().numpy()[0]#take a sample from the categorical dist from 1-22
return action

``````
• AGENT
``````class Agent(mp.Process):
def __init__(self,global_actor_critic,optimizer,input,n_actions
,gamma,lr,worker_name,global_episode_index,env,gather,games,T_max,res_queue,loss_queue):
super(Agent,self).__init__()
self.local_actor_critic=ActorCritics(input,n_actions,env,gamma)
self.global_actor_critic=global_actor_critic
self.worker_name='w%02i'%worker_name
self.episode_idx = global_episode_index
self.env=env
self.gather_eps=gather
self.optimizer=optimizer
self.N_games=games
self.T_max=T_max
self.res_queue=res_queue
self.loss_queue=loss_queue
self.dict_list={'number_of_episodes':[],'score':[],'loss':[]}

def list_remember(self,d_episode,d_score,d_loss):
self.dict_list['number_of_episodes'].append(d_episode)#,d_score,d_loss.item()
self.dict_list['score'].append(d_score)
self.dict_list['loss'].append(d_loss.item())

def run(self):
t_step=1
max_itr=1000
#self.episode_idx is a gloabl parametar from MP class and we need to get the value from it
while self.episode_idx.value < self.N_games:

itr=0
done=False
observation=self.env.reset()
score=0
penalties=0

self.local_actor_critic.clear_memory()
while not done:
soruce,end=self.env.state_dec(observation)
action=self.local_actor_critic.choose_action(soruce,end)
observation_,reward,done=self.env.step(observation,action)
if reward == -1000:
penalties+=1

score += reward
self.local_actor_critic.remember(observation,action,reward)

if t_step% self.T_max==0 or done:
loss=self.local_actor_critic.calc_loss(done)
loss.backward()
#set the current parameters for the workers into the gloabl parameters
for local_param,global_param in zip(self.local_actor_critic.parameters(),
self.global_actor_critic.parameters()):
self.optimizer.step()
self.local_actor_critic.clear_memory()

t_step+=1

itr+=1
observation=observation_
print(self.worker_name,'episode',self.episode_idx.value,'reward',score,'penalties',penalties,'goal',done,
'itr_to_done',itr,'loss',loss.item(),'\n',flush=True)

self.list_remember(self.episode_idx.value,score,loss)

self.gather_eps.append_data(self.episode_idx.value,score,loss.item())
self.res_queue.put(score)
self.loss_queue.put(loss.item())

with self.episode_idx.get_lock():
self.episode_idx.value+=1

self.res_queue.put(None)
self.loss_queue.put(None)

``````

Now my questions: did I implement the algorithm correctly, is the entropy part of my code written correctly, and how can I enhance the code?