Implementing RNN and LSTM into DQN Pytorch code

I am having trouble finding an example online of how to add a recurrent neural network with an LSTM layer to my current deep Q-network in PyTorch so that it becomes a DRQN. Bear with me, I am just getting started. Furthermore, I am NOT working with image processing, so there is no CNN involved and you do not need to worry about that. My states are purely temperature values.

Here is the code that I am currently using to train my DQN:

# AI for Self Driving Car

# Settings to adjust in order to get a better algorithm:
# reward policy
#     - less punishment for increasing the distance from the goal
#     - punish if the car takes more than 10 s to find the goal
# more hidden layers
# more hidden neurons
# gamma
# optimizer

# Importing the libraries

import numpy as np
import random # random samples from different batches (experience replay)
import os # For loading and saving brain
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim # for using stochastic gradient descent
import torch.autograd as autograd # Conversion from tensor (advanced arrays) to avoid all that contains a gradient
# We want to put the tensor into a variable that will also contain a
# gradient, and for this we need:
from torch.autograd import Variable
# which converts a tensor into a Variable containing both the tensor and its gradient


# Creating the architecture of the Neural Network
class Network(nn.Module):
    """Fully connected Q-network mapping a state vector to one Q-value per action.

    Architecture: input -> 40 units (ReLU) -> 30 units (ReLU) -> nb_action
    linear outputs, i.e. two hidden layers.
    """

    def __init__(self, input_size, nb_action):
        """Build the layers.

        input_size: number of input neurons (features per state).
        nb_action:  number of output neurons (one Q-value per action).
        """
        # Required so that the nn.Module machinery registers the layers below.
        super(Network, self).__init__()
        # Remember the layer sizes on the object itself.
        self.nb_action = nb_action
        self.input_size = input_size
        # Full connections between consecutive layers. To add another hidden
        # layer, insert e.g. nn.Linear(30, 30) and use it in forward().
        self.fc1 = nn.Linear(input_size, 40)   # input  -> hidden 1
        self.fc2 = nn.Linear(40, 30)           # hidden 1 -> hidden 2
        self.fc3 = nn.Linear(30, nb_action)    # hidden 2 -> Q-values

    def forward(self, state):
        """Forward-propagate `state` and return the raw Q-values."""
        # Rectifier activation on both hidden layers, none on the output.
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Implementing Experience Replay
# Reinforcement learning is based on a Markov decision process (MDP),
# i.e. on going from one state (s_t) to the next state (s_t+1).
# We store up to `capacity` of these transitions in what we call the memory,
# so that we can use the distribution of past experience to make a decision.
class ReplayMemory(object):
    """Bounded FIFO store of transitions used for experience replay."""

    def __init__(self, capacity):
        """Create an empty memory holding at most `capacity` transitions."""
        self.memory = []          # stored transitions, oldest first
        self.capacity = capacity  # maximum number of transitions kept

    def push(self, event):
        """Append the transition `event`, evicting the oldest one on overflow."""
        self.memory.append(event)
        too_many = len(self.memory) > self.capacity
        if too_many:
            # Drop the oldest transition so the memory never exceeds capacity.
            self.memory.pop(0)

    def sample(self, batch_size):
        """Return one batched Variable per transition field for a random batch.

        `batch_size` transitions are drawn without replacement. The result is
        a lazy `map` object meant to be unpacked by the caller, one item per
        field of the stored transition tuples.
        """
        picked = random.sample(self.memory, batch_size)
        # zip(*...) regroups by field:
        # ((s1, a1, r1), (s2, a2, r2)) -> (s1, s2), (a1, a2), (r1, r2)
        fields = zip(*picked)

        def as_batch(parts):
            # Concatenate along the first dimension and wrap the result in a
            # Variable, as the original code does for every field.
            return Variable(torch.cat(parts, 0))

        return map(as_batch, fields)

# Implementing Deep Q Learning

class Dqn():
    """Deep Q-learning agent: Q-network, replay memory and Adam optimizer.

    Typical use is to call `update(reward, new_signal)` once per environment
    step; it stores the latest transition, picks the next action and, once
    enough transitions are stored, performs one learning step.
    """

    # Number of transitions sampled per learning step; learning only starts
    # once the replay memory holds more than this many transitions.
    BATCH_SIZE = 100
    # Maximum number of recent rewards kept for score().
    REWARD_WINDOW_SIZE = 1000
    # Maximum number of transitions kept in the replay memory.
    MEMORY_CAPACITY = 100000

    def __init__(self, input_size, nb_action, gamma, lrate, T):
        """Create the agent.

        input_size: number of features in a state.
        nb_action:  number of discrete actions.
        gamma:      discount factor used in the TD target.
        lrate:      learning rate of the Adam optimizer.
        T:          softmax scaling factor used when selecting actions
                    (0 gives a uniform random policy; larger is greedier).
        """
        self.gamma = gamma
        self.T = T
        # Sliding window of the most recent rewards, used by score().
        self.reward_window = []
        self.model = Network(input_size, nb_action)
        self.memory = ReplayMemory(self.MEMORY_CAPACITY)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lrate)
        # Batch of one state: the leading dimension is the batch dimension
        # the network expects. Zeros are used instead of torch.Tensor(n),
        # which returns uninitialised memory that would otherwise end up in
        # the first stored transition.
        self.last_state = torch.zeros(1, input_size)
        self.last_action = 0
        self.last_reward = 0

    def select_action(self, state):
        """Sample an action index (int) from softmax(Q(state) * T).

        Example of the effect of T:
        softmax((1,2,3)) = {0.04,0.11,0.85}, softmax((1,2,3)*3) = {0,0.02,0.98}.
        With T = 0 every action is equally likely.
        """
        # No gradient is needed for action selection.
        with torch.no_grad():
            q_values = self.model(state)
            probs = F.softmax(q_values * self.T, dim=1)
            # Draw exactly one action from the distribution.
            action = probs.multinomial(num_samples=1)
        return int(action[0, 0])

    def learn(self, batch_state, batch_next_state, batch_reward, batch_action):
        """Run one gradient step on the TD error for a batch of transitions."""
        # Q-value of the action actually taken in each state.
        outputs = self.model(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)
        # Best Q-value in the next state; detached so that no gradient flows
        # through the target.
        next_outputs = self.model(batch_next_state).detach().max(1)[0]
        target = self.gamma * next_outputs + batch_reward
        # Huber (smooth L1) loss between prediction and target.
        td_loss = F.smooth_l1_loss(outputs, target)
        self.optimizer.zero_grad()
        # The graph is rebuilt on every call, so it does not need retaining.
        td_loss.backward()
        self.optimizer.step()

    def update(self, reward, new_signal):
        """Record the last transition, choose the next action and learn.

        reward:     reward reported by the environment for this step.
        new_signal: the new state as a sequence of numbers.
        Returns the action selected for the new state.
        """
        new_state = torch.Tensor(new_signal).float().unsqueeze(0)
        # NOTE(review): the transition is stored with self.last_reward (the
        # reward passed to the previous call), not with `reward`. If the
        # caller passes the reward earned by reaching `new_signal`, this is
        # off by one step - confirm against the caller before changing.
        self.memory.push((self.last_state,
                          new_state,
                          torch.LongTensor([int(self.last_action)]),
                          torch.Tensor([self.last_reward])))
        # Having arrived in the new state, pick the action to play next.
        action = self.select_action(new_state)
        if len(self.memory.memory) > self.BATCH_SIZE:
            batch_state, batch_next_state, batch_action, batch_reward = \
                self.memory.sample(self.BATCH_SIZE)
            self.learn(batch_state, batch_next_state, batch_reward, batch_action)
        self.last_action = action
        self.last_state = new_state
        self.last_reward = reward
        self.reward_window.append(reward)
        if len(self.reward_window) > self.REWARD_WINDOW_SIZE:
            del self.reward_window[0]
        return action

    def score(self):
        """Return the mean of the rewards in the window (0.0 when empty)."""
        if not self.reward_window:
            return 0.0
        return sum(self.reward_window) / float(len(self.reward_window))

    def save(self, path='last_brain.pth'):
        """Write the model and optimizer state to `path`."""
        torch.save({'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    }, path)

    def load(self, path='last_brain.pth'):
        """Restore the model and optimizer state from `path` if it exists."""
        if os.path.isfile(path):
            print("=> loading checkpoint... ")
            checkpoint = torch.load(path)
            self.model.load_state_dict(checkpoint['state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            print("done !")
        else:
            print("no checkpoint found...")

I hope someone out there can help me!

2 Likes