DQN is not learning

I have written code for deep Q-learning and tried it on a bunch of environments, but on each of them my algorithm failed to perform correctly. I printed the Q-values for a few states and found that all of the actions have learned almost the same values. I am not sure why this is happening.
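To make the symptom concrete, this is roughly the kind of check I mean (a rough sketch; probe_states is just a placeholder name for a few preprocessed observations saved from the environment):

with torch.no_grad():
    for s in probe_states:  # placeholder: a handful of preprocessed states
        q = main_model.forward(torch.from_numpy(s).float().cuda(), bsize=1)
        # every action prints a nearly identical value
        print(q.view(OUTPUT_SIZE).cpu().numpy())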

import numpy as np
import torch
import torch.nn as nn

# Network, Memory, preprocess_image, env, save_obj and the hyperparameters
# (MEMORY_SIZE, INPUT_IMAGE_DIM, OUTPUT_SIZE, ...) are defined elsewhere
mem = Memory(memsize=MEMORY_SIZE)
main_model = Network(image_input_size=INPUT_IMAGE_DIM,out_size=OUTPUT_SIZE).float().cuda()
target_model = Network(image_input_size=INPUT_IMAGE_DIM,out_size=OUTPUT_SIZE).float().cuda()

# target network starts as an exact copy of the online network
target_model.load_state_dict(main_model.state_dict())
criterion = nn.SmoothL1Loss()
optimizer = torch.optim.Adam(main_model.parameters())

# filling memory with transitions
prev_state = env.reset()
processed_prev_state = preprocess_image(prev_state)

for i in range(0,MAX_STEPS):
    action = np.random.randint(0,4)
    next_state,reward,done = env.step(action)
    processed_next_state = preprocess_image(next_state)
    
    mem.add_sample((processed_prev_state,action,reward,processed_next_state))
    
    if done:
        print('BRO GAME OVER !')
    prev_state = next_state
    processed_prev_state = processed_next_state

print('Populated %d Samples'%(len(mem.memory)))
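
For context, Memory here is just a plain replay buffer; a minimal sketch of the interface the code relies on (add_sample, get_batch, memory), assuming a deque-backed implementation, would look like this (the real class may differ in details):

from collections import deque
import random

class Memory():
    # minimal replay buffer sketch, not the exact implementation
    def __init__(self, memsize):
        self.memsize = memsize
        self.memory = deque(maxlen=self.memsize)  # oldest samples are dropped automatically

    def add_sample(self, sample):
        # sample is a (state, action, reward, next_state) tuple
        self.memory.append(sample)

    def get_batch(self, size):
        # uniform random minibatch of stored transitions
        return random.sample(self.memory, k=size)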

# Algorithm Starts
total_steps = 0
epsilon = INITIAL_EPSILON
loss_stat = []
total_reward_stat = []

for episode in range(0,TOTAL_EPISODES):
    prev_state = env.reset()
    processed_prev_state = preprocess_image(prev_state)
    step_count = 0
    total_reward = 0
    
    while step_count < MAX_STEPS:
        
        step_count +=1
        total_steps +=1
        
        # epsilon-greedy action selection
        if np.random.rand() <= epsilon:
            action = np.random.randint(0,4)
        else:
            with torch.no_grad():
                torch_x = torch.from_numpy(processed_prev_state).float().cuda()
                model_out = main_model.forward(torch_x,bsize=1)
                action = int(torch.argmax(model_out.view(OUTPUT_SIZE),dim=0))
        
        
        next_state, reward, game_over = env.step(action)
        processed_next_state = preprocess_image(next_state)
        total_reward += reward
        
        mem.add_sample((processed_prev_state,action,reward,processed_next_state))
        
        if game_over:
            print('BRO GAME OVER')
        
        prev_state = next_state
        processed_prev_state = processed_next_state
        
        # periodically sync the frozen target network with the online network
        if (total_steps % FREEZE_INTERVAL) == 0:
            target_model.load_state_dict(main_model.state_dict())
        
        # sample a random minibatch of transitions from the replay memory
        batch = mem.get_batch(size=BATCH_SIZE)
        current_states = []
        next_states = []
        acts = []
        rewards = []
        
        for element in batch:
            current_states.append(element[0])
            acts.append(element[1])
            rewards.append(element[2])
            next_states.append(element[3])
        current_states = np.array(current_states)
        next_states = np.array(next_states)
        rewards = np.array(rewards)
        
        # Q-values of next states from the frozen target network and of
        # current states from the online network
        Q_next = target_model.forward(torch.from_numpy(next_states).float().cuda(),bsize=BATCH_SIZE)
        Q_s = main_model.forward(torch.from_numpy(current_states).float().cuda(),bsize=BATCH_SIZE)
        # TD target: r + GAMMA * max_a' Q_target(s', a')
        target_out = (torch.from_numpy(rewards).float().cuda() + (GAMMA * torch.max(Q_next.detach(),dim=1)[0]))
        # copy the online Q-values and overwrite only the taken-action entries,
        # so the loss is non-zero only for the actions that were actually taken
        target_values = torch.zeros(Q_s.size()).cuda()
        target_values.copy_(Q_s.detach())
        
        for i in range(0,BATCH_SIZE):
            target_values[i][acts[i]] = target_out[i].detach()
        
        loss = criterion(Q_s,target_values)
        
        # saving performance stat
        loss_stat.append(loss.item())
        
        # zero out the gradients from the previous step
        optimizer.zero_grad()
        
        # backpropagate
        loss.backward()
        
        # update params
        optimizer.step()
    
    # save performance stat
    total_reward_stat.append(total_reward/MAX_STEPS)
    
    # linearly anneal epsilon towards FINAL_EPSILON
    if epsilon > FINAL_EPSILON:
        epsilon -= (INITIAL_EPSILON - FINAL_EPSILON)/TOTAL_EPISODES
    
    if (episode + 1)% PERFORMANCE_SAVE_INTERVAL == 0:
        perf = {}
        perf['loss'] = loss_stat
        perf['total_reward'] = total_reward_stat
        save_obj(name='MDP_ENV',obj=perf)
    
    #print('Completed episode : ',episode+1,' Epsilon : ',epsilon,' Reward : ',total_reward/MAX_STEPS)
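
For reference, the copy-and-overwrite target construction above is essentially equivalent to gathering the Q-values of the taken actions and regressing only those onto the TD target. A compact sketch of that formulation, using the same tensors as in the loop above, would be:

# equivalent loss computation: pick out Q(s, a) for the stored actions
acts_t = torch.tensor(acts).long().cuda().unsqueeze(1)      # shape (BATCH_SIZE, 1)
q_taken = Q_s.gather(1, acts_t).squeeze(1)                  # Q(s, a) for the taken actions
# TD target: r + GAMMA * max_a' Q_target(s', a')
td_target = torch.from_numpy(rewards).float().cuda() + GAMMA * torch.max(Q_next.detach(), dim=1)[0]
loss = criterion(q_taken, td_target)

The only practical difference is the averaging: SmoothL1Loss averages over every element, so the copy-and-overwrite version scales the loss (and its gradients) down by an extra factor of OUTPUT_SIZE, since the untouched entries contribute zero.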

Do the values of the weights change after each .step?

Yes. I tried printing the sum of the weights of the fc2 layer after each .step, and every time I got a different sum.

        # update params
        optimizer.step()
        
        print(torch.sum(main_model.fc1.weight.data).item())

gives

14.266899108886719
16.328222274780273
16.12299346923828
15.318760871887207
14.778365135192871
14.173827171325684
13.708841323852539
13.355201721191406
12.91303825378418
12.379934310913086
11.765584945678711
11.129434585571289

Hi Mayank, did you get any clue about what happened? I am getting almost the same Q-values for any state as well. Thanks!