# CNN not training

I am completely new to CNN’s, and I do not quite know how to design or use them efficiently. That being said, I am attempting to build a CNN that learns to play Pac-man with reinforcement learning. I have trained it for about 3 hours and have seen little to no improvement. My observation space is 3 channels * 15 * 19, and there are 5 actions. Here is my code, I am open to any and all suggestions. Thanks for all your help.

``````from minipacman import MiniPacman as pac
from torch import nn
import torch
import random
import torch.optim as optimal
from torch.autograd import Variable
import matplotlib.pyplot as plt
import numpy as np
import keyboard

loss_fn = nn.MSELoss()
epsilon = 1
env = pac("regular", 1000)
time = 0
action = random.randint(0, 4)
q = np.zeros(3)
alpha = 0.01
gamma = 0.9
tick = 0
decay = 0.9999

class Value_Approximator (nn.Module):
def __init__(self):
super(Value_Approximator, self).__init__()
# Convolution 1
self.cnn1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, stride=1, padding=2)
self.relu1 = nn.ReLU()

# Max pool 1
self.maxpool1 = nn.MaxPool2d(kernel_size=2)

# Convolution 2
self.cnn2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2)
self.relu2 = nn.ReLU()

# Max pool 2
self.maxpool2 = nn.MaxPool2d(kernel_size=2)

# Fully connected 1 (readout)
self.fc1 = nn.Linear(384, 5)

def forward(self, x):
# Convolution 1
out = self.cnn1(x)
out = self.relu1(out)

# Max pool 1
out = self.maxpool1(out)

# Convolution 2
out = self.cnn2(out)
out = self.relu2(out)

# Max pool 2
out = self.maxpool2(out)

# Resize
# Original size: (100, 32, 7, 7)
# out.size(0): 100
# New out size: (100, 32*7*7)
out = out.view(out.size(0), -1)

# Linear function (readout)
out = self.fc1(out)

return out

approx = Value_Approximator()
optimizer = optimal.SGD(approx.parameters(), lr=alpha)

while time < 50000:
print("Time: "+str(time))
print("Epsilon: "+str(epsilon))
print()
time += 1
state = env.reset()
tick = 0

epsilon *= decay

if epsilon < 0.1:
epsilon = 0.1

while True:
tick += 1
state = np.expand_dims(state, 1)
state = state.reshape(1, 3, 15, 19)
q = approx.forward(torch.from_numpy(state))

if random.uniform(0, 1) < epsilon:
action = env.action_space.sample()
else:
_, action = torch.max(q, -1)
action = action.item()
new_state, reward, terminal, _ = env.step(action)
show_state = new_state
new_state = np.expand_dims(new_state, 1)
new_state = state.reshape(1, 3, 15, 19)

q_new = approx.forward(torch.from_numpy(new_state).type(torch.FloatTensor))  # " find Q (s', a') "
#  find optimal action Q value for next step
_, new_max = torch.max(q_new, -1)
new_max = new_max.item()

q_target = q.clone()
q_target = Variable(q_target.data)

#  update target value function according to TD
q_target[action] = reward + torch.mul(new_max, gamma)  # " reward + gamma*(max(Q(s', a')) "

loss = loss_fn(q, q_target)  # " reward + gamma*(max(Q(s', a')) - Q(s, a)) "
# Update original policy according to Q_target ( supervised learning )
loss.backward()
optimizer.step()

#  Q and Q_target should converge
if time % 100 == 0:
state = torch.FloatTensor(show_state).permute(1, 2, 0).cpu().numpy()

plt.subplot(131)
plt.title("Imagined")
plt.imshow(state)
plt.subplot(132)
plt.title("Actual")
plt.imshow(state)
plt.show(block=False)
plt.pause(0.000001)

if keyboard.is_pressed('1'):
torch.save(approx.state_dict(), 'trained-10000.mdl')
if keyboard.is_pressed('9'):
torch.save(approx.state_dict(), 'trained-10000.mdl')

if terminal or tick > 100:
plt.close()
break

state = new_state

torch.save(approx.state_dict(), 'trained-10000.mdl')``````