# Efficient reward scheme for RL agent

I am pretty new to RL and I am trying to code a simple RL task with pytorch.

The goal/task is the following: The initial state is toto and the agent takes an action Δt: t_0+Δt=t_0+Δt=t_1.

If t_1 equals 450 or 475 then it gets a reward, else he does not get a reward.

I am training the agent with DQN algorithm on a NN ( with: 2 Linear layes: fist layer n_in=1 n_out 128 and second layer n_in=128 and n_out=5):

observation space(t_i) is 700 --> t_i∈[0,700[ti∈[0,700[
action space (Δt) is 5 --> (Δt∈[−50,−25,0,25,50]Δt∈[−50,−25,0,25,50])

epsilon_start=0.9#e-greedy threshold start value
epsilon_end=0.01#e-greedy threshold end value
epsilon_decay=200#e-greedy threshold decay learning_rate=0.001# NN
optimizer learning rate batch_size=64#Q-learning batch size

Unfortunately it does not seem to converge to the values t_i = 450 or 475. I doesn’t seem to care about getting a reward. How can I improve my code so that the agent learns what I am trying to teach him? I put my code below in case the explanations were not clear enough:

``````from gym import spaces

class RL_env(gym.Env):

def __init__(self):
super(RL_env, self).__init__()

n_actions_delta = 1 #delta_t
self.action_space = spaces.Discrete(5)

n_observations = 1 #time

self.observation_space = spaces.Discrete(700)

#initial time
self.time = 0

self.done = 0
self.reward = 0

def reset(self):
self.reward = 0
self.done = False
return self.time

def step(self,delta_t):
print('self time',self.time)
d_t = np.arange(-50,70,25)

self.time = (self.time + d_t[delta_t])%700
print('delta time',d_t[delta_t],'-->','self time',self.time)

if self.time == 475 or self.time == 450:
self.reward+=1

else:
self.reward += 0

info = {}
print('total reward',self.reward)
print('\n')
return self.time,self.reward, self.done, info

def render(self, mode='human', close=False):
print()
``````
``````import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
dtype = torch.float
device = torch.device("cpu")
import random
import math
import sys
if not sys.warnoptions:#igrnore warnings
import warnings
warnings.simplefilter("ignore")

#hyper parameters
epsilon_start=0.9
#e-greedy threshold start value
epsilon_end=0.01#e-greedy threshold end value
epsilon_decay=200#e-greedy threshold decay
learning_rate=0.001# NN optimizer learning rate
batch_size=64#Q-learning batch size

env = RL_env()

#use replay memory (-> to stabilize and improve our algorithm)for training: store transitions observed by agent,
#then reuse this data later
#sample from it randomly (batch built by transitions are decorrelated)
class ReplayMemory:#allowes the agent to learn from earlier memories (speed up learning and break undesirable temporal correlations)
def __init__(self, capacity):
self.capacity = capacity
self.memory = []
def push(self, transition):#saves transition
self.memory.append(transition)
if len(self.memory)>self.capacity:#if length of memory arra is larger than capacity (fixed)
del self.memory[0]#remove 0th element

def sample(self, batch_number):#samples randomly a transition to build batch
return random.sample(self.memory, batch_number)

def __len__(self):
return len(self.memory)

#Dqn NN (we want to maximize the discounted, cumulative reward)
#idea of Q-learning: we want to approximate with NN maximal Q-function (gives max return of action in given state)
#training update rule: use the fact that every Q-function for some policy obeys the Bellman equation
#difference between the two sides of the equality is known as the temporal difference error (we want to min -> Huber loss)
#calculate over batch of transitions sampled from the replay memory
class DqnNet(nn.Module):
def __init__(self):
super(DqnNet, self).__init__()

state_space = 1
action_space = env.action_space.n
num_hid = 128
self.fc1 = nn.Linear(state_space, num_hid)
self.fc2 = nn.Linear(num_hid, action_space)
self.gamma=0.5 #Q-learning discount factor (ensures that reward sum converges,
#makes actions from far future less important)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.sigmoid(self.fc2(x))
return x

#select action accordingly to epsilon greedy policy
#sometimes we use model for choosing action, other times sample uniformly
#probability of choosing a random action will start at epsilon_start and will decay (epsilon_decay) exponentially
#towards epsilon_end
steps_done=0
def predict_action(state):
global steps_done
sample=random.random()#random number
eps_threshold=epsilon_end+(epsilon_start-epsilon_end)*math.exp(-1.*steps_done/epsilon_decay)
steps_done += 1
if sample>eps_threshold:
x  = eps_threshold,model(Variable(state,).type(torch.FloatTensor)).data.max(0)[1].view(1, 1)
return x#chose action from model

else:
x = eps_threshold,torch.tensor([[random.randrange(env.action_space.n)]])
return x#choose random action uniformly

#wtih the update_policy function we perform a single step of the optimization
#first sample a batch, concatenate all the tensors into a single one, compute Q-value and max Q-value,
#and combine them into loss
def update_policy():
if len(memory)<batch_size:#we want to sample a batch of size 64
return
transitions = memory.sample(batch_size)#take random transition batch from experience replay memory
batch_state, batch_action, batch_next_state, batch_reward = zip(*transitions)#convert batch-array of Transitions
#to Transition of batch-arrays
#-->zip(*) takes iterables as arguments and return iterator

batch_state = Variable(torch.cat(batch_state))#concatenate given sequence tensors in the given dimension
batch_state = batch_state.resize(batch_size,1)
batch_action = Variable(torch.cat(batch_action))
batch_next_state = Variable(torch.cat(batch_next_state))
batch_next_state = batch_next_state.reshape(batch_size,1)
batch_reward = Variable(torch.cat(batch_reward))

#print('model batch state',model(Variable(batch_state[0])))
current_q_values = model(batch_state).gather(1, batch_action)#current Q-values estimated for all actions,
#compute Q, then select the columns of actions taken,
#these are the actions which would've been taken
#for each batch state according to policy_net
max_next_q_values = model(batch_next_state).detach().max(1)[0]#predicted Q-values for non-final-next-states
#(-> gives max Q)
expected_q_values = batch_reward + (model.gamma * max_next_q_values)

#loss is measured from error between current and newly expected Q values (Huber Loss)
loss = F.smooth_l1_loss(current_q_values, expected_q_values)

# backpropagation of loss to NN --> optimize model
loss.backward()
optimizer.step()
return loss, np.sum(expected_q_values.numpy())

def train(episodes):
scores = []
Losses = []
Bellman = []
Epsilon = []
Times = []
Deltas = []

for episode in range(episodes):
state=env.reset()#reset environment
print('\n')
print('episode',episode)

epsilon_action = predict_action(torch.FloatTensor([state]))

action = epsilon_action[1] #after each time step predict action

next_state, reward, done,info = env.step(action.item())#step through environment using chosen action

epsilon = epsilon_action[0]
Epsilon.append(epsilon)
print(reward,'reward')

state=next_state
Times.append(state)
scores.append(reward)

memory.push((torch.FloatTensor([state]),action,torch.FloatTensor([next_state]),
up = update_policy()#update_policy()#update policy

if up != None:
Losses.append(Variable(up[0]))
print('loss',Variable(up[0]))
Bellman.append(up[1])

#calculate score to determine when the environment has been solved
mean_score=np.mean(scores[-50:])#mean of score of last 50 episodes
#every 50th episode print score
if episode%50 == 0:
print('Episode {}\tScore: {}\tAverage score(last 50 episodes): {:.2f}'.format(episode,scores[-50:],mean_score))

#print('Losses',Losses)
Losses = torch.stack(Losses).numpy()
#print('Losses',Losses)
plt.plot(np.arange(len(Losses)),Losses)
plt.xlabel('Training iterations')
plt.ylabel('Loss')
plt.show()

Bellman = np.array(Bellman)
#print('Bellman',Bellman,'\n')
plt.plot(np.arange(len(Bellman)),Bellman)
plt.xlabel('Training iterations')
plt.ylabel('Bellman target')
plt.show()

#print('scores',scores)
plt.plot(np.arange(len(scores)),scores)
plt.xlabel('Training iterations')
plt.ylabel('Reward')
plt.show()

#print('epsilon',Epsilon)
plt.plot(np.arange(len(Epsilon)),Epsilon)
plt.xlabel('Training iterations')
plt.ylabel('Epsilon')
plt.show()

print('Times',Times[-25:])
print('Deltas',Deltas[-25:])

Times = np.array(Times)
print('Times',Times)
#plt.figure(figsize=(31,20))
plt.figure(figsize=(9,7))
plt.plot(np.arange(len(Times)),(np.array(Times)))
plt.xlabel('Training iterations')
plt.ylabel('t')
plt.show()

Times_1 = np.array(Times[-300:])
print('t',Times)
plt.figure(figsize=(9,7))
plt.plot(np.arange(len(Times_1)),(np.array(Times_1)))
plt.xlabel('Last 300 Training iterations')
plt.ylabel('t')
plt.ylim(0,1000)
plt.show()

model = DqnNet()#policy
memory = ReplayMemory(20000)