Neural network non-convergence

iif_eve · May 25, 2022, 2:55am

beginner for pytorch, I switched from another framework, and try to get started with a net I constructed before. but the net won’t convergence. since I can’t locate the problem, so I post almost everything… please forgive me…

here is the results after 10 epoch, it looks like net already did it’s best, but the loss is still so large, it seems like I should modify my net or something, but it works great on the other frame work, so I’m pretty sure there is something wrong and I didn’t notice
the number 111 means there is 111 batch in epoch

[1,   111] loss: 2.029
[2,   111] loss: 1.359
[3,   111] loss: 1.354
[4,   111] loss: 1.356
[5,   111] loss: 1.357
[6,   111] loss: 1.359
[7,   111] loss: 1.358
[8,   111] loss: 1.355
[9,   111] loss: 1.357
[10,   111] loss: 1.356
ended the train

here is my net structure:

import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self, num_classes= 4):
        super(MyNet, self).__init__()
        self.conv1= nn.Conv2d(in_channels=3,
                               out_channels=32,
                               kernel_size= (3,3),
                               stride= 1, padding= 1)
        self.conv2= nn.Conv2d(in_channels=32,
                               out_channels=64,
                               kernel_size= (3,3),
                               stride= 2, padding= 0)
        self.conv3= nn.Conv2d(in_channels=64,
                               out_channels=64,
                               kernel_size= (3,3),
                               stride= 2, padding= 0)
        self.conv4= nn.Conv2d(in_channels=64,
                               out_channels=64,
                               kernel_size= (3,3),
                               stride= 2, padding= 1)
        self.flatten= nn.Flatten()
        self.linear1= nn.Linear(in_features= 1024, out_features= 64)
        self.linear2= nn.Linear(in_features= 64, out_features= num_classes)


    def forward(self, x):
        x= F.relu(self.conv1(x))
        x= F.relu(self.conv2(x))
        x= F.relu(self.conv3(x))
        x= F.relu(self.conv4(x))
        x= self.flatten(x)
        x= F.relu(self.linear1(x))

        return F.log_softmax(self.linear2(x), dim= 1)

here is some utilities include Dataset and Dataloader

import torch
from PIL import Image
import pandas as pd
import torchvision.transforms as T
from torch.utils.data import Dataset



device = torch.device('cuda')

def get_csv(path):
    img_list = pd.read_csv(path, usecols=['id', 'class_num'])
    size = len(img_list)
    train_img_list = img_list[:int(0.8*size)]
    valid_img_list = img_list[int(0.8*size):]

    return train_img_list,valid_img_list

img_produce =T.Compose([
    T.ToTensor(),
    T.Resize(size=(32,32)),
    T.Normalize(
         mean=(0,0,0),
         std= (255,255,255)
    ),
    T.Normalize(
        mean=[0.5,0.5,0.5],
        std= [1,1,1]
    )
])

class myDataset(Dataset):
    def __init__(self, train_img, test_img, mode= 'train'):
        Dataset.__init__(self)
        self.data = []

        if(mode== 'train'):
            for img in train_img.itertuples() :
                self.data.append(['./data/lemon_lesson/train_images/'+getattr(img, 'id'), getattr(img, 'class_num')])
        else:
            for img in test_img.itertuples():
                self.data.append(['./data/lemon_lesson/test_images/'+getattr(img, 'id'), getattr(img, 'class_num')])

    def load_img(self, image_path):
        img = Image.open(image_path).convert('RGB')
        img = img_produce(img)
        img = img.to(device= device)
        return img

    def __getitem__(self, index):

        img = self.load_img(self.data[index][0])
        lable = self.data[index][1]

        return img,lable

    def __len__(self):
        return len(self.data)

here is how I train the net

import torch
from utils import myDataset
import utils
from torch.utils.data import DataLoader
from MyNet import MyNet
import torch.optim as optim
import torch.nn as nn


if __name__ == '__main__':
    train_path = './data/lemon_lesson/train_images.csv'

    train_img_csv,valid_img_csv = utils.get_csv(train_path)

    train_img = myDataset(
        train_img = train_img_csv,
        test_img= valid_img_csv,
        mode= 'train'
    )

    train_loader = DataLoader(train_img,
                              batch_size=8,
                              shuffle= True,)

    mynet= MyNet()
    device= torch.device('cuda')
    mynet.to(device= device)

    for name, para in mynet.named_parameters():
        print(name, ':', para.shape)

    criterion= nn.CrossEntropyLoss()
    optimizer= optim.Adam(params= mynet.parameters(), lr= 0.01)

    running_acc= 0
    for epoch in range(10):
        running_loss= 0.0
        running_acc = 0
        mynet.train()
        for i,data in enumerate(train_loader, start= 0):
            inputs, labels= data[0].to(device), data[1].to(device)
            optimizer.zero_grad()

            outputs= mynet(inputs)
            #_,pred= torch.max(outputs, 1)
            loss= criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss+= loss.item()

        print(f'[{epoch + 1}] loss: {running_loss/111 :.3f}'
              )
    print('ended the train')

    PATH= './result/net_aistudio.pth'
    torch.save(mynet.state_dict(), PATH)

ptrblck · May 25, 2022, 3:55am

nn.CrossEntropyLoss expects raw logits as the model output so remove the F.log_softmax or use nn.NLLLoss as the criterion alternatively.