PyTorch CNN doesn't update weights while training

I want to predict an 8x8 matrix from an original 8x8 matrix, but the weights DO NOT update during training.

I use a simple conv layer to map the input matrix from 1x8x8 to 2x8x8, then another conv layer to map 2x8x8 back to 1x8x8. The inputs and outputs in the data folder are generated randomly. The PyTorch code is shown below.

I have already checked some posts about weights not updating. I think something must be wrong with "requires_grad = True" on the data, or with loss.backward(). I would be grateful for any suggestions about the code. Thanks in advance.
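(To see that nothing changes, you can snapshot the parameters before training and compare afterwards. A minimal sketch, using the names defined in the script below:)

before = {name: p.detach().clone() for name, p in model.named_parameters()}
train_loop(train_dataloader, model, loss_fn, optimizer)  # one epoch
for name, p in model.named_parameters():
    # torch.equal is True when the tensor is bit-for-bit unchanged
    print(name, "changed =", not torch.equal(before[name], p.detach()))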

The input data files are:
data/CM10_1/CM_1.txt
data/CM10_1/CM_2.txt
data/CM10_1/CM_3.txt
data/CM10_1/CM_4.txt

The output data files are:
data/CM10_2/CM_1.txt
data/CM10_2/CM_2.txt
data/CM10_2/CM_3.txt
data/CM10_2/CM_4.txt

The CSV file data/CM_info_tr.csv contains:

CMname,
CM_1.txt,
CM_2.txt,
CM_3.txt,
CM_4.txt,

Each CM_i.txt looks like this:

207 244 107 173 70 111 180 244
230 246 233 193 11 97 192 86
32 40 202 189 24 195 70 149
232 247 244 100 209 202 173 57
161 244 167 167 177 47 167 191
24 123 9 43 80 124 41 65
71 204 216 180 242 113 30 129
139 36 238 8 8 164 127 178
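(The data files themselves are just random integers saved as plain text. A minimal generation sketch, assuming values in the range 0-255 as in the example above:)

import os
import numpy as np
import pandas as pd

os.makedirs('data/CM10_1', exist_ok=True)
os.makedirs('data/CM10_2', exist_ok=True)
names = ['CM_%d.txt' % i for i in range(1, 5)]
for name in names:
    # one random 8x8 integer matrix per file, space-separated as shown above
    np.savetxt('data/CM10_1/' + name, np.random.randint(0, 256, (8, 8)), fmt='%d')
    np.savetxt('data/CM10_2/' + name, np.random.randint(0, 256, (8, 8)), fmt='%d')
pd.DataFrame({'CMname': names}).to_csv('data/CM_info_tr.csv', index=False)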
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# outline###############################################################
# 
#  CM10_1/CM_i.txt to predict CM10_2/CM_i.txt
#  
#  data pair example
#  CM10_1/CM_1.txt -> CM10_2/CM_1.txt
# 
#  CM10_1/CM_1.txt is 8x8 matrix with random int
#  CM10_2/CM_1.txt is 8x8 matrix with random int
# 
#  The model uses two conv layers 
#  layer 01 : 1x8x8 -> 2x8x8
#  layer 02 : 2x8x8 -> 1x8x8
#  
#  The loss is the difference between 
#  CM10_2/CM_1.txt(predicted) and CM10_2/CM_1.txt
#  

# main ###############################################################

from __future__       import print_function, division

import os
import sys
import torch
import pandas              as pd
import numpy               as np
import torch.nn.functional as F

from skimage          import io, transform

from torch.utils.data import Dataset, DataLoader
from torch            import nn
from torch.autograd   import Variable

torch.cuda.empty_cache()

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# test CM parameters
n_Ca   = 8
batch_size = 4

#device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"

# define class dataset CMDataset ###################################################
class CMDataset(Dataset):
    """CM dataset"""
    def __init__(self, csv_CM, CM_beg_dir, CM_end_dir, n_Ca=n_Ca):
        """
        Args:
        csv_CM       (string): Path to the csv file with CM class.
        CM_beg_dir   (string): Directory with all the CM begin data.
        CM_end_dir   (string): Directory with all the CM end data.
        n_Ca         (int)   : Height/width of the square matrices.
        """
        self.CM_info    = pd.read_csv(csv_CM)
        self.CM_beg_dir = CM_beg_dir
        self.CM_end_dir = CM_end_dir
        self.n_Ca       = n_Ca

    def __len__(self):
        return len(self.CM_info)  # the number of samples

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        #load and convert CM begin data ---------------------------------------
        CM_beg_path  = os.path.join(self.CM_beg_dir, self.CM_info.iloc[idx, 0])
        CM_beg_data  = np.loadtxt(CM_beg_path)
        CM_beg_data  = CM_beg_data.reshape(1, self.n_Ca, self.n_Ca)
        CM_beg_data  = CM_beg_data.astype(np.float32)
        CM_beg_data  = torch.from_numpy(CM_beg_data)
        CM_beg_data  = CM_beg_data.to(device)
        
        #load and convert CM end data ---------------------------------------
        CM_end_path  = os.path.join(self.CM_end_dir, self.CM_info.iloc[idx, 0])
        CM_end_data  = np.loadtxt(CM_end_path)
        CM_end_data  = CM_end_data.reshape(1, self.n_Ca, self.n_Ca)
        CM_end_data  = CM_end_data.astype(np.float32)
        CM_end_data  = torch.from_numpy(CM_end_data)
        CM_end_data  = CM_end_data.to(device)
        
        return CM_beg_data, CM_end_data


# define class model CMNet ###################################################
class CMNet(nn.Module):
    def __init__(self):
        super(CMNet, self).__init__()
        self.lay_CM_01 = nn.Conv2d(in_channels=1,out_channels=2,kernel_size=1,stride=1,bias=True)
        self.lay_CM_02 = nn.Conv2d(in_channels=2,out_channels=1,kernel_size=1,stride=1,bias=True)


    def forward(self, CM_data): 

        [n_in_batch,n_in_chan,n_in_hei,n_in_wid]=CM_data.shape
        n_Ca = n_in_hei

        out1_1 = self.lay_CM_01(CM_data)
        out1_2 = out1_1 
        out1_3 = self.lay_CM_02(out1_2)
        out = out1_3

        return out

# load data for training and validating
CM_dataset_train = CMDataset(csv_CM     = 'data/CM_info_tr.csv',
                             CM_beg_dir = 'data/CM10_1/',
                             CM_end_dir = 'data/CM10_2/',
                             n_Ca       =  n_Ca)

train_dataloader = DataLoader(CM_dataset_train, 
                              batch_size=batch_size, 
                              shuffle=True)

# training parameter 
learning_rate = 2
epochs        = 5

model = CMNet()
model = model.to(device)

# Initialize the loss function
loss_fn = nn.MSELoss(reduction='mean')

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# define train loop ###############################################################
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X1,Y) in enumerate(dataloader):
        X1=X1.to(torch.float32)
        Y = Y.to(torch.float32)
        
        # Compute prediction and loss
        X1=torch.autograd.Variable(X1)
        pred = model(X1)
        pred = torch.autograd.Variable(pred)

        # compute loss 
        loss = loss_fn(pred,Y)
        loss = Variable(loss, requires_grad = True)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss, current = loss.item(), batch * len(X1)
        print(f" loss:{loss:>15f},  [{current:>5d}/{size:>5d}]")

# Train ###############################################################
for t in range(epochs):
    print(f"Epoch {t+1}\n----------------------------------------------")
#    print(list(model.parameters()))
    train_loop(train_dataloader, model, loss_fn, optimizer)

#print("Train and Valid Done!")


ptrblck replied:

Don't re-wrap a tensor that is attached to the computation graph into a new tensor, as this detaches it from the graph:

loss = Variable(loss, requires_grad = True)

Remove this line of code, and also remove the other usages of Variable, as Variable has been deprecated since PyTorch 0.4.
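With those lines removed, the loop body reduces to the usual pattern, and loss.backward() can reach the conv weights. For reference, here is the corrected loop (the original code with the Variable wrapping deleted):

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X1, Y) in enumerate(dataloader):
        pred = model(X1)         # pred stays attached to the graph
        loss = loss_fn(pred, Y)  # and so does loss

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()          # gradients now reach the conv weights
        optimizer.step()

        print(f" loss:{loss.item():>15f},  [{batch * len(X1):>5d}/{size:>5d}]")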

@ptrblck
Thank you for your reply. After deleting those two lines, it works.
I will follow the latest docs to build my model.