Pytorch model accuracy stays almost the same and loss oscillating

Hi!

I created a model to classify chess positions as a good move for black or white

I tried training the model but the validation accuracy almost stays constant and the loss is oscillating

A log of the Accuracy and loss:

Epoch 0 Accuracy 36.0772 Loss 4.301493
Epoch 1 Accuracy 36.0518 Loss 3.896801
Epoch 2 Accuracy 36.0772 Loss 4.297735
Epoch 3 Accuracy 36.1280 Loss 3.851907
Epoch 4 Accuracy 36.0010 Loss 3.515308
Epoch 5 Accuracy 36.0772 Loss 3.375627
Epoch 6 Accuracy 36.0264 Loss 3.823340
Epoch 7 Accuracy 36.0518 Loss 3.453476
Epoch 8 Accuracy 36.1026 Loss 3.087875
Epoch 9 Accuracy 36.0518 Loss 4.504724
Epoch 10 Accuracy 36.1535 Loss 3.944135
Epoch 11 Accuracy 35.9502 Loss 4.940733
Epoch 12 Accuracy 36.0518 Loss 4.199282
Epoch 13 Accuracy 36.1280 Loss 4.538399
Epoch 14 Accuracy 36.0010 Loss 3.787019
Epoch 15 Accuracy 35.9756 Loss 3.599001
Epoch 16 Accuracy 36.0010 Loss 4.977734
Epoch 17 Accuracy 35.9756 Loss 4.441586
Epoch 18 Accuracy 36.1026 Loss 4.767806
Epoch 19 Accuracy 36.1535 Loss 4.689194
Epoch 20 Accuracy 36.2043 Loss 3.324705
Epoch 21 Accuracy 36.0264 Loss 3.859550
Epoch 22 Accuracy 36.0772 Loss 3.823349
Epoch 23 Accuracy 36.0264 Loss 3.766785
Epoch 24 Accuracy 36.0772 Loss 3.660792
Epoch 25 Accuracy 36.1026 Loss 4.057627
Epoch 26 Accuracy 36.0264 Loss 4.357696

model.py

class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1,64,4,2,1)
        self.conv2 = nn.Conv2d(64,64,2,1,1)

        self.fc1 = nn.Linear(64*5*5,1024)
        self.fc2 = nn.Linear(1024,3)

    def forward(self,x):
        x = x.view(-1,1,8,8)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))

        x = x.view(-1,64*25)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return x

    def load(f):
        self.state_dict = torch.load(f).state_dict()
        self.eval()

train.py

import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,ConcatDataset

torch.manual_seed(0)
from data import Data
from model import Network
from utils import *
net = Network()
total_train = []
total_test = []
dir_='data/processed/'
for f in os.listdir(dir_):
    train = Data(dir_+f,train=True)
    test = Data(dir_+f,train=False)
    total_train.append(train)
    total_test.append(test)

train = ConcatDataset(total_train)
test = ConcatDataset(total_test)

trainloader = DataLoader(train,batch_size=128,shuffle=True)
testloader = DataLoader(test,batch_size=32,shuffle=True)

print(f'Name {" "*10}|len\nTest {" "*10}|{len(test)}\nTrin {" "*10}|{len(train)}')

epochs = 500
lr = 0.0001

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters())
pacc = 0
for e in range(epochs):
    net.train()
    for x,y in trainloader:
        optimizer.zero_grad()
        p = net(x)
        loss = loss_fn(p,torch.argmax(y,dim=1))
        optimizer.step()
    net.eval()
    acc = []
    with torch.no_grad():
        for x,y in testloader:
            x = x.unsqueeze(1)
            p = net(x)
            a = accuracy(p,torch.argmax(y,dim=1))
            acc.append(a)
        acc = torch.mean(torch.Tensor(acc))

    print(f'Epoch {e} Accuracy {acc:.4f} Loss {loss:5f}')
    if acc > pacc:
        torch.save(net,f'acc_{acc:.4f}')
        pacc = acc

the data is chess data from the “figs game database”

it is preprocessed as follows

import bz2
import re
import chess
import numpy as np
import bz2
import os

class DataBuilder:
    def __init__(self,dir_,save_dir='data/processed'):
        self.dir = dir_
        self.data={'x':[],'y':[]}
        self.save_dir = save_dir


    def remove_metadata(self,f):
        print(type(f))
        for idx,line in enumerate(f.split('\n')):
            if line[:2] == '1.' and "forfeits by disconnection" not in line:
                end = line.index('{')
                pline = line[:end].replace('+','').replace('#','')
                pline = re.sub(r'[1-9][0-9]*\.\s', '', pline)
                result = float(0 if '1-0' in line[end:]
                                    else 1 if '0-1' in line[end:]
                                    else 0.5)
                self.data['x'].append(pline)
                self.data['y'].append(result)


    
    def read_bz2(self,f):
        f = bz2.open(f,'r')
        f = f.read().decode()
        return f

    def read(self,f):
        f = open(f,'r')
        f = f.read()
        return f

    
    def one_hot(self):
        piece_dict = {
            'p':-1,
            'r':-5,
            'n':-3,
            'b':-3.5,
            'q':-9,
            'k':-999,
            'P':1,
            'R':5,
            'N':3,
            'B':3.5,
            'Q':9,
            'K':999,
            '.':0
        }

        result_dict = {
            0:[1,0,0],
            1:[0,0,1],
            0.5:[0,1,0]
        }


        def make_matrix(board):
            board = board.epd().split(' ',1)[0].split('/')
            b = []
            for row in board:
                for cell in row:
                    if cell.isdigit():
                        for _ in range(int(cell)):
                            b.append(piece_dict['.'])
                    else:
                        b.append(piece_dict[cell])
            return b


        for idx,(x,y) in enumerate(zip(self.data['x'],self.data['y'])):
            board = chess.Board()
            moves = x.split()
            for move in moves:
                board.push_san(move)
                self.data['x'][idx] = make_matrix(board)
                self.data['y'][idx] = result_dict[y]


    def process(self):
        for f in os.listdir(self.dir):
            if f.endswith('bz2'):
                self.remove_metadata(self.read_bz2(self.dir+f))
            else:
                self.remove_metadata(self.read(self.dir+f))
            self.one_hot()
            self.save(f)
            self.data={'x':[],'y':[]}

    
    def save(self,f):
        np.save(f'{self.save_dir}/{f}_processed',self.data)


builder = DataBuilder('data/raw/')
builder.process()

torch dataset

import numpy as np
import torch
from torch.utils.data import Dataset


class Data(Dataset):
    def __init__(self,data=None,train=True,tratio=0.005):
        
        data = np.load(data,allow_pickle=1).tolist()
        self.x = data['x']
        self.y = data['y']
        tnum = int(tratio*len(self.y))

        if train:
            self.x = self.x[:len(self.y)-tnum]
            self.y = self.y[:len(self.y)-tnum]
        else:
            self.x = self.x[-tnum:]
            self.y = self.y[-tnum:]


    def __getitem__(self,idx):
        x = torch.Tensor(self.x[idx]).T
        y = torch.Tensor(self.y[idx])

        return x,y
    
    def __len__(self):
        return len(self.y)

utils.py

def accuracy(p,y):
    return round(100*(sum(torch.argmax(p,dim=1) == y)/len(y)).item(),4)

I would be glad if somone checks this out

I have been working on this for three days. I tried changing the model architecture and changing the learning rate

It seems that there is no loss.backward() between the loss calculation and optimizer.step().

https://github.com/pytorch/examples/blob/master/mnist/main.py#L43-L45 can be a reference.

Oh thanks. That was a clumsy mistake
I added loss.backward
Better or worse??

Epoch 1 Accuracy 49.0854 Loss 1.098612
Epoch 2 Accuracy 49.1870 Loss 1.098612
Epoch 3 Accuracy 49.2378 Loss 1.098612
Epoch 4 Accuracy 49.1362 Loss 1.098612
Epoch 5 Accuracy 49.1870 Loss 1.098612
Epoch 6 Accuracy 49.2124 Loss 1.098612
Epoch 7 Accuracy 49.2124 Loss 1.098612
Epoch 8 Accuracy 49.1870 Loss 1.098612
Epoch 9 Accuracy 49.1616 Loss 1.098612
Epoch 10 Accuracy 49.1870 Loss 1.098612
Epoch 11 Accuracy 49.2124 Loss 1.098612
Epoch 12 Accuracy 49.3394 Loss 1.098612
Epoch 13 Accuracy 49.2632 Loss 1.098612
Epoch 14 Accuracy 49.2378 Loss 1.098612
Epoch 15 Accuracy 49.1616 Loss 1.098612
Epoch 16 Accuracy 49.2632 Loss 1.098612
Epoch 17 Accuracy 49.3394 Loss 1.098612
Epoch 18 Accuracy 49.3648 Loss 1.098612
Epoch 19 Accuracy 49.1616 Loss 1.098612
Epoch 20 Accuracy 49.1870 Loss 1.098612
Epoch 21 Accuracy 49.1108 Loss 1.098612
Epoch 22 Accuracy 49.2632 Loss 1.098612
Epoch 23 Accuracy 49.1870 Loss 1.098612
Epoch 24 Accuracy 49.2632 Loss 1.098612
Epoch 25 Accuracy 49.2632 Loss 1.098612
Epoch 26 Accuracy 49.2886 Loss 1.098612
Epoch 27 Accuracy 49.1870 Loss 1.098612

The loss is now constant and the accuracy doesnt increase or reduce

Network Architecture

class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1,64,4,2,1)
        self.conv2 = nn.Conv2d(64,64,2,1,1)

        self.fc1 = nn.Linear(64*5*5,1024)
        self.fc2 = nn.Linear(1024,3)

    def forward(self,x):
        x = x.view(-1,1,8,8)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))

        x = x.view(-1,64*25)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return x

    def load(f):
        self.state_dict = torch.load(f).state_dict()
        self.eval()

Train loop

trainloader = DataLoader(train,batch_size=128,shuffle=True)
testloader = DataLoader(test,batch_size=32,shuffle=True)
print(f'Name {" "*10}|len\nTest {" "*10}|{len(test)}\nTrin {" "*10}|{len(train)}')

epochs = 500
lr = 0.001

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(),lr=lr)
pacc = 0
for e in range(epochs):
    net.train()
    for x,y in trainloader:
        y=torch.argmax(y,dim=1)
        optimizer.zero_grad()
        p = net(x)
        loss = loss_fn(p,y)
        loss.backward()
        optimizer.step()
    net.eval()
    acc = []
    with torch.no_grad():
        for x,y in testloader:
            x = x.unsqueeze(1)
            p = net(x)
            a = accuracy(p,torch.argmax(y,dim=1))
            acc.append(a)
        acc = torch.mean(torch.Tensor(acc))

    print(f'Epoch {e+1} Accuracy {acc:.4f} Loss {loss:5f}')
    if acc > pacc:
        torch.save(net,f'acc_{acc:.4f}')
        pacc = acc

Removing F.relu from the last layer of forward pass will do the job…