I have written code for deep Q-learning and tried it on a bunch of environments, but on each of them the algorithm failed to perform correctly. I printed the Q-values for a few states and found that every action has learned almost the same value. I am not sure why this is happening.
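For reference, this is roughly how I printed them (a quick sanity check, not part of the training loop; `sample_states` is just a handful of preprocessed observations I collected, and the `bsize` keyword matches my Network's forward signature):

# inspect Q-values for a few held-out states
with torch.no_grad():
    for s in sample_states:
        x = torch.from_numpy(s).float().cuda()
        q = main_model(x, bsize=1).view(OUTPUT_SIZE)
        print(q.cpu().numpy())   # all actions come out nearly identical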
import numpy as np
import torch
import torch.nn as nn

# Memory, Network, preprocess_image, save_obj, env and the ALL_CAPS
# hyperparameters are defined elsewhere in my project
mem = Memory(memsize=MEMORY_SIZE)
main_model = Network(image_input_size=INPUT_IMAGE_DIM, out_size=OUTPUT_SIZE).float().cuda()
target_model = Network(image_input_size=INPUT_IMAGE_DIM, out_size=OUTPUT_SIZE).float().cuda()
target_model.load_state_dict(main_model.state_dict())

criterion = nn.SmoothL1Loss()
optimizer = torch.optim.Adam(main_model.parameters())
# fill the replay memory with random transitions
prev_state = env.reset()
processed_prev_state = preprocess_image(prev_state)
for i in range(MAX_STEPS):
    action = np.random.randint(0, 4)
    next_state, reward, done = env.step(action)
    processed_next_state = preprocess_image(next_state)
    mem.add_sample((processed_prev_state, action, reward, processed_next_state))
    if done:
        print('BRO GAME OVER !')
        next_state = env.reset()          # start a fresh episode instead of stepping a finished one
        processed_next_state = preprocess_image(next_state)
    prev_state = next_state
    processed_prev_state = processed_next_state
print('Populated %d Samples' % len(mem.memory))
# algorithm starts here
total_steps = 0
epsilon = INITIAL_EPSILON
loss_stat = []
total_reward_stat = []
for episode in range(TOTAL_EPISODES):
    prev_state = env.reset()
    processed_prev_state = preprocess_image(prev_state)
    step_count = 0
    total_reward = 0
    while step_count < MAX_STEPS:
        step_count += 1
        total_steps += 1
        # epsilon-greedy action selection
        if np.random.rand() <= epsilon:
            action = np.random.randint(0, 4)
        else:
            with torch.no_grad():
                torch_x = torch.from_numpy(processed_prev_state).float().cuda()
                model_out = main_model(torch_x, bsize=1)
                action = int(torch.argmax(model_out.view(OUTPUT_SIZE), dim=0))
        next_state, reward, game_over = env.step(action)
        processed_next_state = preprocess_image(next_state)
        total_reward += reward
        mem.add_sample((processed_prev_state, action, reward, processed_next_state))
        if game_over:
            print('BRO GAME OVER')
            next_state = env.reset()      # reset so we do not keep acting in a finished episode
            processed_next_state = preprocess_image(next_state)
        prev_state = next_state
        processed_prev_state = processed_next_state
        # periodically freeze a copy of the main network as the target network
        if (total_steps % FREEZE_INTERVAL) == 0:
            target_model.load_state_dict(main_model.state_dict())
        # sample a minibatch and unpack it
        batch = mem.get_batch(size=BATCH_SIZE)
        current_states = []
        next_states = []
        acts = []
        rewards = []
        for element in batch:
            current_states.append(element[0])
            acts.append(element[1])
            rewards.append(element[2])
            next_states.append(element[3])
        current_states = np.array(current_states)
        next_states = np.array(next_states)
        rewards = np.array(rewards)
        Q_next = target_model(torch.from_numpy(next_states).float().cuda(), bsize=BATCH_SIZE)
        Q_s = main_model(torch.from_numpy(current_states).float().cuda(), bsize=BATCH_SIZE)
        # bootstrapped target: r + gamma * max_a' Q_target(s', a')
        target_out = torch.from_numpy(rewards).float().cuda() + GAMMA * torch.max(Q_next.detach(), dim=1)[0]
        # copy the current predictions and overwrite only the taken actions,
        # so the TD error is non-zero only for the chosen action
        target_values = Q_s.detach().clone()
        for i in range(BATCH_SIZE):
            target_values[i][acts[i]] = target_out[i]
        loss = criterion(Q_s, target_values)
        # save per-step loss
        loss_stat.append(loss.item())
        # zero the previous gradients
        optimizer.zero_grad()
        # backpropagate
        loss.backward()
        # update the parameters
        optimizer.step()
    # save per-episode performance stats
    total_reward_stat.append(total_reward / MAX_STEPS)
    # linearly anneal epsilon over the training run
    if epsilon > FINAL_EPSILON:
        epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / TOTAL_EPISODES
    if (episode + 1) % PERFORMANCE_SAVE_INTERVAL == 0:
        perf = {}
        perf['loss'] = loss_stat
        perf['total_reward'] = total_reward_stat
        save_obj(name='MDP_ENV', obj=perf)
    # print('Completed episode:', episode + 1, 'Epsilon:', epsilon, 'Reward:', total_reward / MAX_STEPS)
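One thing I am unsure about: my replay tuples never store a done flag, so terminal transitions still bootstrap from the next state. For comparison, here is a sketch of the target with terminal masking (the `dones` array is an assumption, since my buffer does not currently record it):

# DQN target with terminal-state masking (sketch; `dones` is hypothetical here)
rewards_t = torch.from_numpy(rewards).float().cuda()
dones_t = torch.from_numpy(dones.astype(np.float32)).cuda()   # 1.0 where the episode ended
with torch.no_grad():
    max_next_q = target_model(torch.from_numpy(next_states).float().cuda(), bsize=BATCH_SIZE).max(dim=1)[0]
target_out = rewards_t + GAMMA * (1.0 - dones_t) * max_next_q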