CNN not training

I am completely new to CNNs, and I do not quite know how to design or use them effectively. That said, I am attempting to build a CNN that learns to play Pac-Man with reinforcement learning. I have trained it for about 3 hours and have seen little to no improvement. My observation space is 3 channels × 15 × 19, and there are 5 actions. Here is my code; I am open to any and all suggestions. Thanks for your help.

from minipacman import MiniPacman as pac
from torch import nn
import torch
import random
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import keyboard


loss_fn = nn.MSELoss()
epsilon = 1.0                  # exploration rate for epsilon-greedy
env = pac("regular", 1000)
time = 0                       # episode counter
action = random.randint(0, 4)  # initial random action
q = np.zeros(5)                # placeholder Q-values (5 actions); overwritten by the network every step
alpha = 0.01                   # learning rate
gamma = 0.9                    # discount factor
tick = 0                       # step counter within an episode
decay = 0.9999                 # per-episode epsilon decay factor
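
# The network below approximates Q(s, a) for the 5 actions; actions are chosen
# epsilon-greedily, with epsilon annealed from 1.0 down to a floor of 0.1.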


class Value_Approximator(nn.Module):
    def __init__(self):
        super(Value_Approximator, self).__init__()
        # Convolution 1
        self.cnn1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, stride=1, padding=2)
        self.relu1 = nn.ReLU()

        # Max pool 1
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)

        # Convolution 2
        self.cnn2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2)
        self.relu2 = nn.ReLU()

        # Max pool 2
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)

        # Fully connected 1 (readout)
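        # 384 = 32 channels * 3 * 4: the 15x19 input keeps its size through each conv
        # (kernel 5, padding 2), then shrinks to 7x9 and 3x4 through the two 2x2 max pools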
        self.fc1 = nn.Linear(384, 5)

    def forward(self, x):
        # Convolution 1
        out = self.cnn1(x)
        out = self.relu1(out)

        # Max pool 1
        out = self.maxpool1(out)

        # Convolution 2
        out = self.cnn2(out)
        out = self.relu2(out)

        # Max pool 2
        out = self.maxpool2(out)

        # Flatten: (batch, 32, 3, 4) -> (batch, 384)
        out = out.view(out.size(0), -1)

        # Linear function (readout)
        out = self.fc1(out)

        return out

approx = Value_Approximator()
optimizer = optim.SGD(approx.parameters(), lr=alpha)


while time < 50000:
    print("Time: "+str(time))
    print("Epsilon: "+str(epsilon))
    print()
    time += 1
    state = env.reset()
    tick = 0

    # decay epsilon each episode, but never below 0.1 to keep some exploration
    epsilon = max(epsilon * decay, 0.1)

    while True:
        tick += 1
        state = state.reshape(1, 3, 15, 19)              # add batch dimension: (3, 15, 19) -> (1, 3, 15, 19)
        q = approx(torch.from_numpy(state).float())[0]   # Q(s, ·) over the 5 actions

        # epsilon-greedy action selection
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            _, action = torch.max(q, -1)
            action = action.item()
        new_state, reward, terminal, _ = env.step(action)
        show_state = new_state
        new_state = new_state.reshape(1, 3, 15, 19)      # reshape the *new* state, not the old one

        # Q(s', ·) for the next state; detach so no gradient flows through the target
        q_new = approx(torch.from_numpy(new_state).float())[0].detach()  # " find Q (s', a') "
        #  bootstrap value: max over actions of Q(s', a') (the value, not the argmax index)
        new_max = q_new.max().item()

        #  TD target: copy the current Q-values and overwrite the entry for the taken action
        q_target = q.detach().clone()
        q_target[action] = reward + gamma * new_max  # " reward + gamma*(max(Q(s', a')) "

        loss = loss_fn(q, q_target)  # " reward + gamma*(max(Q(s', a')) - Q(s, a)) "
        # Update original policy according to Q_target ( supervised learning )
        approx.zero_grad()
        loss.backward()
        optimizer.step()

        #  Q and Q_target should converge
        if time % 100 == 0:
            # render the current frame every 100 episodes (convert CHW -> HWC for imshow)
            frame = np.transpose(show_state, (1, 2, 0))

            plt.title("MiniPacman")
            plt.imshow(frame)
            plt.show(block=False)
            plt.pause(0.000001)

        # press '1' to save a checkpoint at any time
        if keyboard.is_pressed('1'):
            torch.save(approx.state_dict(), 'trained-10000.mdl')

        if terminal or tick > 100:
            plt.close()
            break

        state = new_state


torch.save(approx.state_dict(), 'trained-10000.mdl')