Speed up CNN training in PyTorch

Please help me speed up my algorithm on Windows 10 with 32 CPUs and 64 GB of RAM; each run of 10 epochs currently takes about 30 minutes. I have done the following:

  1. I wrapped the entry point in an if __name__ == '__main__': clause, as required on Windows 10.
  2. I use num_workers = 2 with pin_memory = False, which worked better for me than the alternatives I compared; batch_size = 10. I also have a worker algorithm that uses a multiprocessing pool with 24 processes (a sketch of this setup is below).
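Items 1 and 2 together look roughly like this (a minimal, self-contained sketch; FakeData and main() are placeholders rather than my real code):

import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # FakeData stands in for the real ImageFolder datasets used further down.
    data = datasets.FakeData(size=100, image_size=(3, 224, 224),
                             num_classes=18, transform=transforms.ToTensor())

    # Item 2: num_workers=2 loader processes, pin_memory=False, batch_size=10.
    # pin_memory only speeds up host-to-GPU copies, so it buys nothing for CPU-only training.
    loader = DataLoader(data, batch_size=10, num_workers=2,
                        pin_memory=False, shuffle=True)

    for X, y in loader:
        X, y = X.to(device), y.to(device)
        # ... training step goes here ...
        break

if __name__ == '__main__':
    # Item 1: the guard is required on Windows, where DataLoader worker
    # processes are started with 'spawn' and re-import this module.
    main()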

How can I vectorize my algorithm?

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import os
import time
import seaborn as sn  # for heatmaps
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import pandas as pd
import openpyxl

# ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")


def run(device):
    # device is supplied by the caller and is used for both the model and the batches
    root = '../'
    excel_file = openpyxl.load_workbook('corridas/corridas.xlsx')
    count = 20  # Excel column the next run's metrics are written to
    mm = 2      # Excel row the next run's metrics are written to

    for cor in range(2,1983): 
    
    
            print ("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
            print ("        Modelo a correr")
            print ("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
            excel_sheet = excel_file['corrida']
            convo = int(excel_sheet.cell(row=cor, column=2).value)
            kernel = int(excel_sheet.cell(row=cor, column=3).value)
            bias=int(excel_sheet.cell(row=cor, column=4).value)
            image = int(excel_sheet.cell(row=cor, column=5).value)       # input image size
            tipo_image = int(excel_sheet.cell(row=cor, column=6).value)  # input channels: 1 = grayscale, 3 = RGB
            biass = bool(bias)
            # constants
            paddingg = 1
            stridee = 1
        
            if tipo_image==1:
                train_transform = transforms.Compose([
                    transforms.Grayscale(num_output_channels=1),
                    transforms.RandomRotation(10,fill=(0,)),      # rotate +/- 10 degrees
                    transforms.RandomHorizontalFlip(),  # reverse 50% of images
                    transforms.Resize(image),             # resize shortest side to the target size (image)
                    transforms.CenterCrop(image),         # center-crop to image x image pixels
                    transforms.ToTensor(),
                    transforms.Normalize([0.4161,],[0.1688,])
                ])
            
                test_transform = transforms.Compose([
                    transforms.Grayscale(num_output_channels=1),
                    transforms.Resize(image),
                    transforms.CenterCrop(image),
                    transforms.ToTensor(),
                    transforms.Normalize([0.4161,],[0.1688,])
                ])
                inv_normalize = transforms.Normalize(
                 mean=[0.5], std=[0.5]
                )
                print("gray")
                
            
            else:
                train_transform = transforms.Compose([
                    transforms.RandomRotation(10),      # rotate +/- 10 degrees
                    transforms.RandomHorizontalFlip(),  # reverse 50% of images
                    transforms.Resize(image),             # resize shortest side to the target size (image)
                    transforms.CenterCrop(image),         # center-crop to image x image pixels
                    transforms.ToTensor(),
                    transforms.Normalize([0.485, 0.456, 0.406],
                                         [0.229, 0.224, 0.225])
                ])
            
                test_transform = transforms.Compose([
                    transforms.Resize(image),
                    transforms.CenterCrop(image),
                    transforms.ToTensor(),
                    transforms.Normalize([0.485, 0.456, 0.406],
                                         [0.229, 0.224, 0.225])
                ])
                
                inv_normalize = transforms.Normalize(
                mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225],
                std=[1/0.229, 1/0.224, 1/0.225]
                )
                
                print("RGB")
                
            
            train_data = datasets.ImageFolder(os.path.join(root, 'train_real'), transform=train_transform)
            test_data = datasets.ImageFolder(os.path.join(root, 'validation'), transform=test_transform)
            
            
            
            torch.manual_seed(42)
            

            train_loader = DataLoader(train_data, batch_size=10,num_workers=2, pin_memory=False,shuffle=True)
            test_loader = DataLoader(test_data, batch_size=10, num_workers=2,pin_memory=False,shuffle=True)
            
            # get the class names (labels) from the dataset
            class_names = train_data.classes
            
            print(class_names)
            print(f'Training images available: {len(train_data)}')
            print(f'Testing images available:  {len(test_data)}')
            
            
            
            
            f = image  # original image size
            ### compute the spatial size of the feature maps after each conv + 2x2 max-pool block
            for i in range(1, convo + 1):
                f = (((f - kernel) + (2 * paddingg)) / stridee) + 1  # conv output size
                f = f / 2                                            # 2x2 max pooling halves it
            f = int(f)
            print(f)
            
            if ( convo == 2 and (kernel==3 or kernel==5) and (biass==False or biass ==True)):
                class ConvolutionalNetwork(nn.Module):
                    def __init__(self):
                        super().__init__()
                        self.conv1 = nn.Conv2d(tipo_image, 6, kernel_size=(kernel,kernel), stride=stridee,padding=paddingg,bias=biass)
                        self.conv2 = nn.Conv2d(6, 16, kernel_size=(kernel,kernel), stride=stridee,padding=paddingg,bias=biass)
            
                    
                        self.fc1 = nn.Linear(f*f*16, 120)
                        self.fc2 = nn.Linear(120, 84)
                        self.fc3 = nn.Linear(84, 18)  # 18 output classes
            
                    def forward(self, X):
                        X = F.relu(self.conv1(X))
                        X = F.max_pool2d(X, 2, 2)  # 2x2 max pooling
                        X = F.relu(self.conv2(X))
                        X = F.max_pool2d(X, 2, 2)
                        X = X.view(-1, f*f*16)
                        X = F.relu(self.fc1(X))
                        X = F.relu(self.fc2(X))
                        X = self.fc3(X)
                        return F.log_softmax(X, dim=1)
                torch.manual_seed(101)
                CNNmodel = ConvolutionalNetwork().to(device)  # move the model to the same device as the batches
                CNNmodel = CNNmodel.share_memory()            # only needed if the model is shared across processes
                criterion = nn.NLLLoss()  # the model already applies log_softmax; CrossEntropyLoss would apply it twice
                optimizer = torch.optim.Adam(CNNmodel.parameters(), lr=0.001)
                print(CNNmodel)
            
                start_time = time.time()
                
                epochs = 10
                
                max_trn_batch = 800
                max_tst_batch = 300
                
                train_losses = []
                test_losses = []
                train_correct = []
                test_correct = []
                
                for i in range(epochs):
                    trn_corr = 0
                    tst_corr = 0
                    
                    # Run the training batches
                    for b, (X_train, y_train) in enumerate(train_loader):

                        # Limit the number of batches
                        if b == max_trn_batch:
                            break
                        b += 1  # make b 1-based for the progress printout below
                        X_train = X_train.to(device)
                        y_train = y_train.to(device)
                        
                        # Apply the model
                        y_pred = CNNmodel(X_train)
                        loss = criterion(y_pred, y_train)
                 
                        # Tally the number of correct predictions
                        predicted = torch.max(y_pred.data, 1)[1]
                        batch_corr = (predicted == y_train).sum()
                        trn_corr += batch_corr
                        
                        # Update parameters
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                
                        # Print interim results
                        if b%200 == 0:
                            print(f'epoch: {i:2}  batch: {b:4} [{10*b:6}/8000]  loss: {loss.item():10.8f}  \
                                  accuracy: {trn_corr.item()*100/(10*b):7.3f}%')
                
                    train_losses.append(loss)
                    train_correct.append(trn_corr)
                
                    # Run the testing batches
                    with torch.no_grad():
                        for b, (X_test, y_test) in enumerate(test_loader):
                            X_test = X_test.to(device)
                            y_test = y_test.to(device)
                            # Limit the number of batches
                            if b == max_tst_batch:
                                break
                
                            # Apply the model
                            y_val = CNNmodel(X_test)
                
                            # Tally the number of correct predictions
                            predicted = torch.max(y_val.data, 1)[1] 
                            tst_corr += (predicted == y_test).sum()
                
                    loss = criterion(y_val, y_test)  # validation loss of the last test batch only
                    test_losses.append(loss)
                    test_correct.append(tst_corr)
                
                stop = int(time.time() - start_time)  # total training time in seconds
                ############## confusion matrix ##################
                # Initialize the prediction and label lists(tensors)
                predlist=torch.zeros(0,dtype=torch.long, device='cpu')
                lbllist=torch.zeros(0,dtype=torch.long, device='cpu')
            
                with torch.no_grad():
                    for i, (inputs, classes) in enumerate(test_loader):
                        inputs = inputs.to(device)
                        classes = classes.to(device)
                        outputs = CNNmodel(inputs)
                        _, preds = torch.max(outputs, 1)
                        
                        # Append batch prediction results
                        predlist=torch.cat([predlist,preds.view(-1).cpu()])
                        lbllist=torch.cat([lbllist,classes.view(-1).cpu()])
            
                
                    
                if count <= 100:
                    excel_sheet = excel_file['acc']
                    excel_sheet.cell(row=mm, column=count).value = round(test_correct[-1].item()*100/3000,2)
                    excel_sheet = excel_file['prc']
                    excel_sheet.cell(row=mm, column=count).value =round(precision_score(lbllist.numpy(), predlist.numpy(), average="weighted")*100,2)
                    excel_sheet = excel_file['recall']
                    excel_sheet.cell(row=mm, column=count).value =round(recall_score(lbllist.numpy(), predlist.numpy(), average="weighted")*100,2)
                    excel_sheet = excel_file['f1']
                    excel_sheet.cell(row=mm, column=count).value =round(f1_score(lbllist.numpy(), predlist.numpy(), average="weighted")*100,2)
                    excel_sheet = excel_file['times']
                    excel_sheet.cell(row=mm, column=count).value =stop
                   
                    count=count+1
        
                else: 
                    count=2
                    mm=mm+1  
                    
    
                excel_file.save('corridas/corridas.xlsx')

Have you profiled the code and checked where the bottleneck is?
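For example, a rough way to see whether data loading or the forward/backward pass dominates each iteration is to time the two parts separately (a minimal sketch; train_loader, CNNmodel, criterion, and optimizer are the objects defined in your code above):

import time

device = next(CNNmodel.parameters()).device   # wherever the model lives
data_time, compute_time = 0.0, 0.0

end = time.time()
for b, (X_train, y_train) in enumerate(train_loader):
    data_time += time.time() - end            # time spent waiting for the batch

    start = time.time()
    X_train, y_train = X_train.to(device), y_train.to(device)
    y_pred = CNNmodel(X_train)
    loss = criterion(y_pred, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    compute_time += time.time() - start       # time spent on the model itself

    end = time.time()
    if b == 100:                              # ~100 batches give a rough picture
        break

print(f'data loading: {data_time:.1f}s   compute: {compute_time:.1f}s')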

Yes, num_workers=2 with pin_memory=False improved things, but I don't understand why, because when I use more workers it gets slower.

Have a look at this post from @rwightman. In particular this section:

Beyond an optimal number (experiment!), throwing more worker processes at the IOPS barrier WILL NOT HELP, it’ll make it worse. You’ll have more processes trying to read files at the same time, and you’ll be increasing the shared memory consumption by significant amounts for additional queuing, thus increasing the paging load on the system and possibly taking you into thrashing territory that the system may never recover from
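In other words, treat num_workers as something to measure on your machine rather than maximize. A minimal sketch of such an experiment (train_data is the ImageFolder dataset from your code; on Windows this also has to run under the if __name__ == '__main__': guard):

import time
from torch.utils.data import DataLoader

for workers in (0, 1, 2, 4, 8):
    loader = DataLoader(train_data, batch_size=10, num_workers=workers,
                        pin_memory=False, shuffle=True)
    start = time.time()
    for b, _ in enumerate(loader):
        if b == 200:                    # a fixed number of batches is enough to compare
            break
    print(f'num_workers={workers}: {time.time() - start:.1f}s for 200 batches')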