I am using AlphaZero to program a Connect Four game. Whatever board I feed into the model, the policy and value outputs stay the same; even with random input data, the output never changes. I added print statements after the first convolutional layer and after the ResNet blocks, and those intermediate activations do vary with the input. When I feed random data directly into the policy and value heads on their own, their outputs vary as well. But when I run the full model end to end, the output stays constant. Here is my model code:
import torch
from torch import nn
class Block(nn.Module):
    def __init__(self, num_channels):
        super(Block, self).__init__()
        self.conv = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
        self.batch_norm = nn.BatchNorm2d(num_channels)

    def forward(self, x):
        i = x
        # print("Shape of x before the block", x.shape)
        # x = self.batch_norm(x)
        x = self.relu(x)
        x = self.conv(x)
        # print("Shape of x after the first conv", x.shape)
        # x = self.batch_norm(x)
        x = self.relu(x)
        x = self.conv2(x)
        # print("Shape of x after the second conv", x.shape)
        return x + i


class PolicyHead(nn.Module):
    def __init__(self, num_channels, num_actions):
        super(PolicyHead, self).__init__()
        self.num_channels = num_channels
        self.conv = nn.Conv2d(num_channels, 2, kernel_size=1)
        self.relu = nn.ReLU()
        self.batch_norm = nn.BatchNorm2d(2)
        self.fc = nn.Linear(in_features=84, out_features=num_actions)

    def forward(self, x):
        x = x.reshape(-1, self.num_channels, 6, 7)
        x = self.conv(x)
        # x = self.batch_norm(x)
        x = self.relu(x)
        x = torch.flatten(x, 1)
        return self.fc(x)


class ValueHead(nn.Module):
    def __init__(self, num_channels):
        super(ValueHead, self).__init__()
        self.num_channels = num_channels
        self.conv = nn.Conv2d(num_channels, 2, kernel_size=1)
        self.relu = nn.ReLU()
        self.batch_norm = nn.BatchNorm2d(2)
        self.fc = nn.Linear(84, num_channels)
        self.fc2 = nn.Linear(num_channels, 1)

    def forward(self, x):
        x = x.reshape(-1, self.num_channels, 6, 7)
        x = self.conv(x)
        # x = self.batch_norm(x)
        x = self.relu(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = torch.tanh(x)
        return x


class AlphaZero(nn.Module):
    def __init__(self, num_channels=128, num_actions=7, num_blocks=20):
        super(AlphaZero, self).__init__()
        self.num_channels = num_channels
        self.conv = nn.Conv2d(1, num_channels, kernel_size=3, padding=1)
        self.blocks = nn.Sequential(*[Block(num_channels) for _ in range(num_blocks)])
        self.policy_head = PolicyHead(num_channels, num_actions)
        self.value_head = ValueHead(num_channels)

    def forward(self, x):
        x = x.view(-1, 1, 6, 7)
        x = self.conv(x)
        x = self.blocks(x)
        # print("Shape of x before the view", x.shape)
        x = x.view(-1, self.num_channels)
        # print("Shape of x after the view", x.shape)
        return self.policy_head(x), self.value_head(x)
Here is some code I used to test it:
weights = torch.load("./model-880", weights_only=True)
model = AlphaZero(num_channels=32, num_actions=7, num_blocks=8).to("cuda")
model.load_state_dict(weights['model_state_dict'])
x = torch.randint(10, (1, 6, 7), dtype=torch.float).to("cuda")
print(x)
print(model(x))
During training, the loss fluctuated rather than steadily decreasing.
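For completeness, this is roughly how I verified that the trunk reacts to the input while the final outputs do not. My actual check was plain print statements inside forward; the snippet below is only a sketch of an equivalent probe using forward hooks (the make_hook helper and the loop are illustrative, not my exact code):

probe = AlphaZero(num_channels=32, num_actions=7, num_blocks=8).to("cuda")
probe.load_state_dict(torch.load("./model-880", weights_only=True)['model_state_dict'])

def make_hook(name):
    def hook(module, inputs, output):
        # Summarise the activation so changes between inputs are easy to spot.
        print(name, "mean:", output.mean().item(), "std:", output.std().item())
    return hook

probe.conv.register_forward_hook(make_hook("after first conv"))
probe.blocks.register_forward_hook(make_hook("after ResNet blocks"))

for _ in range(3):
    board = torch.randint(10, (1, 6, 7), dtype=torch.float).to("cuda")
    policy, value = probe(board)
    print("policy:", policy)
    print("value:", value)

In my runs, the hooked activations after self.conv and self.blocks differ for each random board, but the policy and value tensors printed at the end are identical every time.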