DQN converging at negative reward

I am trying to solve Tic Tac Toe with a DQN approach. Ultimately, I want the network to play against a random player and see if it can always achieve at least a draw. For the moment, however, I let the agent play all moves for both players (I want to see whether the network architecture works as intended). I would therefore expect the agent to learn a way of playing that always results in a win for one player.

What I observe instead is that, after some random draws and wins in the beginning, the network converges to the point where it always makes an illegal move, which results in a negative reward. My code looks as follows:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random


class TicTacToe:
    def __init__(self):
        self.reset()
        self.observation_space = 27
        self.action_space = 9

    def reset(self):
        self.done = False
        self.state = [0]*10             # 9 board cells plus one entry encoding whose turn it is
        self.turn = random.choice([-1, 1])
        self.state[-1] = self.turn
        return self.state

    def is_winner(self):
        state = self.state

        # check horizontal
        if state[0] == state[1] == state[2] and state[0] != 0:
            return True
        if state[3] == state[4] == state[5] and state[3] != 0:
            return True
        if state[6] == state[7] == state[8] and state[6] != 0:
            return True

        # check vertical
        if state[0] == state[3] == state[6] and state[0] != 0:
            return True
        if state[1] == state[4] == state[7] and state[1] != 0:
            return True
        if state[2] == state[5] == state[8] and state[2] != 0:
            return True

        # check diagonal
        if state[0] == state[4] == state[8] and state[0] != 0:
            return True
        if state[2] == state[4] == state[6] and state[2] != 0:
            return True

        return False

    def step(self, action):
        reward = 0
        if self.state[action] != 0:     # illegal move
            reward = -10.0
            self.done = True
        self.state[action] = self.turn  # make move
        if self.is_winner():            # check for win
            reward = 1.0
            self.done = True
        else:
            self.turn = -self.turn
            self.state[-1] = self.turn  # update last state, which refers to the turn
        if self.state.count(0) == 0:    # check for draw
            reward = 0.5
            self.done = True
        return self.state, reward, self.done, None

    def render(self):
        state = self.state.copy()
        for i, symbol in enumerate(state):
            if symbol == 0:
                state[i] = ' '
            elif symbol == 1:
                state[i] = 'X'
            elif symbol == -1:
                state[i] = 'O'

        print('  ' + str(state[0]) + ' | ' + str(state[1]) + ' | ' + str(state[2]) + '  ')
        print('-------------')
        print('  ' + str(state[3]) + ' | ' + str(state[4]) + ' | ' + str(state[5]) + '  ')
        print('-------------')
        print('  ' + str(state[6]) + ' | ' + str(state[7]) + ' | ' + str(state[8]) + '  ')


class DQN(nn.Module):
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super(DQN, self).__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)    # the tutorial uses *self.input_dims here
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.out = nn.Linear(self.fc2_dims, self.n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, observation):
        state = torch.Tensor(observation).to(self.device)
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        actions = self.out(x)

        return actions


class Agent(object):
    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                 max_mem_size=1_000_000, eps_end=0.01, eps_dec=0.996):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_end = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.action_space = [i for i in range(n_actions)]
        self.mem_size = max_mem_size        # we need a memory to store experiences and randomly sample from them
        self.mem_counter = 0
        self.Q_eval = DQN(lr=self.lr, n_actions=self.n_actions, input_dims=input_dims,
                          fc1_dims=128, fc2_dims=128)
        self.Q_target = DQN(lr=self.lr, n_actions=self.n_actions, input_dims=input_dims,
                          fc1_dims=128, fc2_dims=128)
        self.state_memory = np.zeros((self.mem_size, *input_dims))
        self.new_state_memory = np.zeros((self.mem_size, *input_dims))
        self.action_memory = np.zeros((self.mem_size, self.n_actions),
                                      dtype=np.uint8)
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.uint8)      # sequence of done flags

    def store_transition(self, state, action, reward, state_, terminal):
        index = self.mem_counter % self.mem_size
        self.state_memory[index] = state
        actions = np.zeros(self.n_actions)
        actions[action] = 1.0       # one hot encoding of actions
        self.action_memory[index] = actions
        self.reward_memory[index] = reward
        self.terminal_memory[index] = terminal
        self.new_state_memory[index] = state_
        self.mem_counter += 1

    def choose_action(self, observation):
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            actions = self.Q_eval.forward(observation)
            action = torch.argmax(actions).item()
        return action

    def learn(self):
        if self.mem_counter > self.batch_size:
            self.Q_eval.optimizer.zero_grad()

            max_mem = self.mem_counter if self.mem_counter < self.mem_size \
                else self.mem_size
            batch = np.random.choice(max_mem, self.batch_size)

            state_batch = self.state_memory[batch]
            action_batch = self.action_memory[batch]
            action_values = np.array(self.action_space, dtype=np.int32)
            action_indices = np.dot(action_batch, action_values)
            reward_batch = self.reward_memory[batch]
            terminal_batch = self.terminal_memory[batch]
            new_state_batch = self.new_state_memory[batch]

            reward_batch = torch.Tensor(reward_batch).to(self.Q_eval.device)
            terminal_batch = torch.Tensor(terminal_batch).to(self.Q_eval.device)

            q_eval = self.Q_eval.forward(state_batch).to(self.Q_eval.device)
            #q_target = self.Q_target.forward(state_batch).to(self.Q_target.device)  # alternative to q_eval.clone()
            q_target = q_eval.clone()
            q_next = self.Q_eval.forward(new_state_batch).to(self.Q_eval.device)

            batch_index = np.arange(self.batch_size, dtype=np.int32)
            q_target[batch_index, action_indices] = reward_batch + \
                self.gamma * torch.max(q_next, dim=1)[0] * terminal_batch

            self.epsilon = self.epsilon * self.eps_dec if self.epsilon > \
                self.eps_end else self.eps_end

            loss = F.smooth_l1_loss(q_target, q_eval).to(self.Q_eval.device)
            loss.backward()
            self.Q_eval.optimizer.step()


def train():
    env = TicTacToe()
    brain = Agent(gamma=0.9, epsilon=1.0, batch_size=64, n_actions=9,
                  input_dims=[10], lr=0.001)
    scores = []
    eps_history = []
    n_games = 10000

    for i in range(n_games):
        if i % 100 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i-100):(i+1)])
            print('episode', i, 'average score %.3f' % avg_score)
        score = 0
        eps_history.append(brain.epsilon)
        observation = env.reset()
        done = False
        while not done:
            action = brain.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            brain.store_transition(observation, action, reward, observation_, done)
            brain.learn()
            observation = observation_

        scores.append(score)

train()

I tried changing the number of weights as well as different learning rates, epsilon decay rates, and gamma factors. Unfortunately, nothing has worked so far. I assume that the problem arises from a logical error in my learn() function.

Two side notes: I already checked that the weights are actually changing after each epoch. Moreover, I am not using a target network so far.
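
For reference, using the (currently unused) Q_target would roughly mean syncing it every few hundred learn() calls and bootstrapping q_next from it. The learn_step_counter attribute and the interval of 500 below are placeholders I have not actually added yet:

# inside Agent.__init__ I would add:
#     self.learn_step_counter = 0
# and inside Agent.learn(), before computing q_next:
if self.learn_step_counter % 500 == 0:                        # arbitrary sync interval
    self.Q_target.load_state_dict(self.Q_eval.state_dict())   # copy online weights into the target net
self.learn_step_counter += 1

q_next = self.Q_target.forward(new_state_batch).detach()      # bootstrap from the frozen copy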

Alright, I have found an error. I think my update line

q_target[batch_index, action_indices] = reward_batch + \
    self.gamma * torch.max(q_next, dim=1)[0] * terminal_batch

is not correct: terminal_batch is zero whenever the game is not done, while the bootstrap term should be zeroed only at the terminal state, i.e. when the game is done. I fixed this, and now it seems to work better. However, I still couldn't get the average reward above 0.4. I also changed the reward for an illegal move to 0, which means that after training the network still only manages to avoid illegal moves in about 40% of the games. Is this a normal result after about 10,000 games played?
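
For clarity, here is the corrected target computation as a standalone sketch with dummy tensors, just to show the shapes. The .detach() is an extra precaution I added so that no gradient flows through the target; my actual code only uses clone():

import torch

gamma = 0.9
# dummy batch of 4 transitions with 9 actions each
q_eval = torch.randn(4, 9)                            # Q(s, a) for the sampled states
q_next = torch.randn(4, 9)                            # Q(s', a) for the successor states
reward_batch = torch.tensor([0.0, 1.0, 0.5, 0.0])
terminal_batch = torch.tensor([0.0, 1.0, 1.0, 0.0])   # done flags: 1 = episode ended
action_indices = torch.tensor([0, 4, 8, 2])           # actions actually taken
batch_index = torch.arange(4)

q_target = q_eval.clone().detach()                    # no gradient through the target
# bootstrap only for non-terminal transitions: multiply by (1 - done) instead of done
q_target[batch_index, action_indices] = reward_batch + \
    gamma * torch.max(q_next, dim=1)[0] * (1 - terminal_batch)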

Not much response so far, but I won’t give up hope.

I encountered another problem. I wanted to test the performance after training the network, and unfortunately I always get a reward of zero.

The validation loop is implemented in the train() function and looks as follows:

def train(n_games, lr):
    env = TicTacToe()
    brain = Agent(gamma=0.99, epsilon=1.0, batch_size=512, n_actions=9,
                  input_dims=[10], lr=lr)
    scores = []
    test_scores = []
    eps_history = []

    for i in range(n_games):
        if i % 100 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i-100):(i+1)])
            print('episode:', i, 'average score %.3f:' % avg_score, 'epsilon:', brain.epsilon)
        score = 0
        eps_history.append(brain.epsilon)
        observation = env.reset()
        done = False
        while not done:
            action = brain.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            brain.store_transition(observation, action, reward, observation_, done)
            brain.learn()
            observation = observation_

        scores.append(score)

    # save model
    torch.save(brain.Q_eval.state_dict(), 'parameters.pth')

    for i in range(100):
        brain.epsilon = 0.
        score = 0
        observation = env.reset()
        done = False
        while not done:
            action = brain.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            observation = observation_
        test_scores.append(score)

    print(np.mean(test_scores))

So while the training ended with an average score of about 0.3, the test with the trained network resulted in a score of 0. This changes, however, when I add these two lines to the test loop:

brain.store_transition(observation, action, reward, observation_, done)
brain.learn()

That way, I get results similar to those at the end of the training loop. But then I keep training the network, which is not what I want… Any advice on why this happens?
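
For reference, the evaluation I am aiming for is completely decoupled from training, roughly like this (epsilon set to zero once before the loop, and torch.no_grad() only to make explicit that nothing is being learned):

brain.epsilon = 0.                       # greedy policy for evaluation
brain.Q_eval.eval()                      # inference mode (cosmetic here, there are no dropout/batchnorm layers)
test_scores = []
with torch.no_grad():                    # no gradients needed during evaluation
    for _ in range(100):
        score = 0
        observation = env.reset()
        done = False
        while not done:
            action = brain.choose_action(observation)
            observation, reward, done, info = env.step(action)
            score += reward
        test_scores.append(score)
print(np.mean(test_scores))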