DQN algorithm in pytorch not converging

Debjit_Chowdhury · August 30, 2020, 9:47pm

I am new to deep reinforcement learning and have implemented the algorithm on my own but the value is not converging could anyone take a look and tell me what is wrong with my algorithm and can i do to make it better Here is the code:

import gym
import torch
import numpy as np
import torch 
import random
from collections import deque
from itertools import count

class ReplayBuffer:
    def __init__(self):
        self.buffer=deque(maxlen=50000)
    def push(self,state,action,reward,next_state,done):
        if(len(self.buffer)<=1000):
            self.buffer.append((state,action,reward,next_state,done))
    def sample(self, batch_size: int, continuous: bool = True):
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)
        if continuous:
            rand = random.randint(0, len(self.buffer) - batch_size)
            return [self.buffer[i] for i in range(rand, rand + batch_size)]
        else:
            indexes = np.random.choice(np.arange(len(self.buffer)), size=batch_size, replace=False)
            return [self.buffer[i] for i in indexes]
class NNetwork(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.l1=torch.nn.Linear(4,128)
        self.l2=torch.nn.Linear(128,128)
        self.l3=torch.nn.Linear(128,2)
        
        self.optimizer=torch.optim.Adam(params=self.parameters(),lr=0.001)
        self.criterion=torch.nn.MSELoss()
    def forward(self,x):
        al1=torch.nn.ReLU()(self.l1(x))
        al2=torch.nn.ReLU()(self.l2(al1))
        al3=self.l3(al2)
        return al3
class Agent():
    def __init__(self):
        
        self.env=gym.make('CartPole-v0')
        self.mem=ReplayBuffer()
        self.q_local=NNetwork()
        self.q_target=NNetwork()
        self.q_target.load_state_dict(self.q_local.state_dict())
        self.epsilon=1.0
        self.e_decay=0.0995
        self.e_min=0.1
        self.update=4
        self.score=0
        self.gamma=0.99

    def predict(self,state):
        if (np.random.randn()<self.epsilon):
            return random.randint(0,1)
        else:
            index=self.q_local.forward(torch.Tensor(state).unsqueeze(0))
            return torch.argmax(index,dim=1).item()
    
    def step(self):
        state=self.env.reset()
        done=False
        i=0
        while not done:
            action=self.predict(state)
            n_state,reward,done,_=self.env.step(action)
            self.mem.push(state,action,reward,n_state,done)
            self.score+=reward
            self.learn()
            state=n_state
            i+=1
            if(i%10==0):
                if(self.epsilon>self.e_min):
                    self.epsilon=self.epsilon-self.e_decay
                else:
                    self.epsilon=self.e_min
                self.q_target.load_state_dict(self.q_local.state_dict())
          
        print(self.score)
        self.score=0
    def learn(self):
        if(len(self.mem.buffer)%32==0):
            return
        batch =self.mem.sample(32)
        state,action,reward,n_state,done= zip(*batch)
        state=torch.Tensor(state)
        action=torch.Tensor(action).unsqueeze(1)
        n_state=torch.Tensor(n_state)
        reward=torch.Tensor(reward).unsqueeze(1)
        done=torch.Tensor(done).unsqueeze(1)

        self.q_local.optimizer.zero_grad()
        
        q_N=self.q_local.forward(state).gather(1,action.long())
        q_t=self.q_target.forward(n_state)
        y=reward+(1-done)*self.gamma*torch.max(q_t,dim=1,keepdim=True)[0]
        
        loss=self.q_local.criterion(q_N,y)
        loss.backward()
        self.q_local.optimizer.step()
agent=Agent()
for t in count():
    print("EP ",t)
    agent.step()

Well I am being ale to produce few scores but it is not converging

liuruiqi1107 · August 30, 2020, 10:29pm

Hi, I think the reason why your algorithm is not converging is that you are calculating your gradient both when you calculate q_N and q_t, and that will cause gradient incorrectness. You can use q_t.detach() to remove q_t from the calculate chain. Hope it works!