I have a problem running PyTorch. I get the following error:
ctypes.CDLL(dll)
self._handle = _dlopen(self._name, mode)
OSError: [WinError 126] The specified module could not be found ("No se puede encontrar el módulo especificado")
Please help me. My system is Windows 10, 64-bit.
Could you provide the logs from Process Monitor to narrow down the issue as described here?
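In the meantime, a quick way to narrow it down yourself is to try loading each DLL that ships with torch directly via ctypes. A minimal sketch, assuming a standard pip install layout:

import ctypes
import glob
import os
from importlib import util

# locate the installed torch package without importing it
# (importing torch is exactly what fails here)
torch_dir = os.path.dirname(util.find_spec("torch").origin)

# try to load every DLL shipped with torch and report the ones that fail
for dll in sorted(glob.glob(os.path.join(torch_dir, "lib", "*.dll"))):
    try:
        ctypes.CDLL(dll)
        print("OK  ", dll)
    except OSError as exc:
        print("FAIL", dll, exc)

The first DLL that fails usually points to the missing dependency (e.g. a missing Visual C++ redistributable or CUDA library).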
Thanks, but I have another problem. I run a PyTorch CNN on a server with 64 GB RAM and 32 cores, but it consumes 24% of the RAM. I use the following code:
from multiprocessing import Pool

import test_model_win_auto_fila
import torch

if __name__ == "__main__":
    used_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_processors = 10
    p = Pool(processes=num_processors)
    # map() needs a callable; writing run(used_device) here would execute it
    # immediately in the parent process instead of in the workers
    p.map(test_model_win_auto_fila.run, [used_device for _ in range(num_processors)])
This might be expected, if each process loads e.g. the data into the system RAM.
Could you explain this issue a bit more and what the expected memory usage is?
I am running Windows 10, 64-bit. The cluster has 32 cores and 64 GB RAM. There are 2000 iterations, and each one reads and processes 18000 images, but each iteration takes 30 minutes. How can I improve this?
You could profile the code and check for bottlenecks.
Once you see where the bottleneck is (e.g. data loading, preprocessing, model forward/backward), you could check why it sits in this particular part.
E.g. if your data loading is too slow, make sure to load the data from a local SSD and use multiple workers in a DataLoader (see the sketch below).
On the other hand, your model might contain inefficient code, e.g. for loops that could be vectorized.
It’s hard to tell how to optimize something in general, as the bottleneck might come from a lot of different parts.
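As a starting point, here is a minimal sketch of timing the data loading and enabling multiple workers; the dataset path and the transform are placeholders for your setup:

import time
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

if __name__ == "__main__":  # required on Windows when num_workers > 0
    # placeholder transform: resize/crop so all samples batch to the same shape
    transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ])
    dataset = datasets.ImageFolder("../train_real", transform=transform)  # placeholder path

    # several worker processes load and preprocess batches in parallel
    loader = DataLoader(dataset, batch_size=10, shuffle=True,
                        num_workers=4, pin_memory=True)

    t0 = time.time()
    for b, (data, target) in enumerate(loader):
        t1 = time.time()
        print(f"batch {b}: data loading took {t1 - t0:.3f}s")
        # ... forward/backward would go here; time it separately ...
        t0 = time.time()
        if b == 10:
            break

If the per-batch loading time dominates, increasing num_workers or moving the data to a faster disk is the first thing to try.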
With num_workers on Windows 10 I get a BrokenPipeError.
This code is called by Windows multiprocessing:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import os
import seaborn as sn # for heatmaps
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import pandas as pd
import openpyxl
# ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")
def run(device):
    root = '../'
    excel_file = openpyxl.load_workbook('corridas/corridas.xlsx')
    excel_sheet = excel_file['corrida']
    acc = []
    prc = []
    recall = []
    f1 = []
    times = []
    count = 2
    mm = 2
    for cor in range(2, 2002):
        print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
        print(" Model to run")
        print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")
        excel_sheet = excel_file['corrida']
        # read the hyperparameters of this run from the spreadsheet
        convo = int(excel_sheet.cell(row=cor, column=2).value)
        kernel = int(excel_sheet.cell(row=cor, column=3).value)
        bias = int(excel_sheet.cell(row=cor, column=4).value)
        size = excel_sheet.cell(row=cor, column=5).value
        image = int(size)
        type = int(excel_sheet.cell(row=cor, column=6).value)
        tipo_image = int(type)
        biass = bool(bias)
        # constants
        paddingg = 1
        stridee = 1
        if tipo_image == 1:
            train_transform = transforms.Compose([
                transforms.Grayscale(num_output_channels=1),
                transforms.RandomRotation(10, fill=(0,)),  # rotate +/- 10 degrees
                transforms.RandomHorizontalFlip(),         # flip 50% of images
                transforms.Resize(image),                  # resize shortest side to `image` pixels
                transforms.CenterCrop(image),              # center-crop to image x image
                transforms.ToTensor(),
                transforms.Normalize([0.4161, ], [0.1688, ])
            ])
            test_transform = transforms.Compose([
                transforms.Grayscale(num_output_channels=1),
                transforms.Resize(image),
                transforms.CenterCrop(image),
                transforms.ToTensor(),
                transforms.Normalize([0.4161, ], [0.1688, ])
            ])
            inv_normalize = transforms.Normalize(
                mean=[0.5], std=[0.5]
            )
            print("gray")
        else:
            train_transform = transforms.Compose([
                transforms.RandomRotation(10),             # rotate +/- 10 degrees
                transforms.RandomHorizontalFlip(),         # flip 50% of images
                transforms.Resize(image),                  # resize shortest side to `image` pixels
                transforms.CenterCrop(image),              # center-crop to image x image
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])
            ])
            test_transform = transforms.Compose([
                transforms.Resize(image),
                transforms.CenterCrop(image),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])
            ])
            inv_normalize = transforms.Normalize(
                mean=[-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225],
                std=[1 / 0.229, 1 / 0.224, 1 / 0.225]
            )
            print("RGB")
        train_data = datasets.ImageFolder(os.path.join(root, 'train_real'), transform=train_transform)
        test_data = datasets.ImageFolder(os.path.join(root, 'validation'), transform=test_transform)
        torch.manual_seed(42)
        train_loader = DataLoader(train_data, batch_size=10, pin_memory=False, shuffle=True)
        test_loader = DataLoader(test_data, batch_size=10, pin_memory=False, shuffle=True)
        # get the class names from the dataset
        class_names = train_data.classes
        print(class_names)
        print(f'Training images available: {len(train_data)}')
        print(f'Testing images available: {len(test_data)}')
        i = 1
        f = image  # original image size
        ### compute the spatial size of the output after each conv + 2x2 pooling
        for i in range(1, convo + 1):
            f = (((f - kernel) + (2 * paddingg)) / stridee) + 1
            f = f / 2
            f = int(f)
            print(f)
        if convo == 2 and (kernel == 3 or kernel == 5) and (biass == False or biass == True):
            class ConvolutionalNetwork(nn.Module):
                def __init__(self):
                    super().__init__()
                    self.conv1 = nn.Conv2d(tipo_image, 6, kernel_size=(kernel, kernel), stride=stridee, padding=paddingg, bias=biass)
                    self.conv2 = nn.Conv2d(6, 16, kernel_size=(kernel, kernel), stride=stridee, padding=paddingg, bias=biass)
                    self.fc1 = nn.Linear(f * f * 16, 120)
                    self.fc2 = nn.Linear(120, 84)
                    self.fc3 = nn.Linear(84, 18)  # 18 output classes

                def forward(self, X):
                    X = F.relu(self.conv1(X))
                    X = F.max_pool2d(X, 2, 2)  # 2x2 max pooling
                    X = F.relu(self.conv2(X))
                    X = F.max_pool2d(X, 2, 2)
                    X = X.view(-1, f * f * 16)
                    X = F.relu(self.fc1(X))
                    X = F.relu(self.fc2(X))
                    X = self.fc3(X)
                    return F.log_softmax(X, dim=1)

            torch.manual_seed(101)
            CNNmodel = ConvolutionalNetwork()
            CNNmodel = CNNmodel.share_memory()
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(CNNmodel.parameters(), lr=0.001)
            print(CNNmodel)
            import time
            start_time = time.time()
            epochs = 10
            max_trn_batch = 800
            max_tst_batch = 300
            train_losses = []
            test_losses = []
            train_correct = []
            test_correct = []
            for i in range(epochs):
                trn_corr = 0
                tst_corr = 0
                # Run the training batches
                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # note: shadows the `device` argument of run()
                for b, (X_train, y_train) in enumerate(train_loader):
                    # Limit the number of batches
                    if b == max_trn_batch:
                        break
                    b += 1  # make the batch index 1-based for logging
                    X_train = X_train.to(device)
                    y_train = y_train.to(device)
                    # Apply the model
                    y_pred = CNNmodel(X_train)
                    loss = criterion(y_pred, y_train)
                    # Tally the number of correct predictions
                    predicted = torch.max(y_pred.data, 1)[1]
                    batch_corr = (predicted == y_train).sum()
                    trn_corr += batch_corr
                    # Update parameters
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    # Print interim results
                    if b % 200 == 0:
                        print(f'epoch: {i:2} batch: {b:4} [{10*b:6}/8000] loss: {loss.item():10.8f} \
accuracy: {trn_corr.item()*100/(10*b):7.3f}%')
                train_losses.append(loss.item())  # store the scalar, not the graph-attached tensor
                train_correct.append(trn_corr)
                # Run the testing batches
                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
                with torch.no_grad():
                    for b, (X_test, y_test) in enumerate(test_loader):
                        X_test = X_test.to(device)
                        y_test = y_test.to(device)
                        # Limit the number of batches
                        if b == max_tst_batch:
                            break
                        # Apply the model
                        y_val = CNNmodel(X_test)
                        # Tally the number of correct predictions
                        predicted = torch.max(y_val.data, 1)[1]
                        tst_corr += (predicted == y_test).sum()
                        loss = criterion(y_val, y_test)
                test_losses.append(loss.item())
                test_correct.append(tst_corr)
            torch.save(CNNmodel.state_dict(), 'quinua_pytorch.pt')
            print(test_correct)
            ############## confusion matrix ##################
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # Initialize the prediction and label lists (tensors)
            predlist = torch.zeros(0, dtype=torch.long, device='cpu')
            lbllist = torch.zeros(0, dtype=torch.long, device='cpu')
            with torch.no_grad():
                for i, (inputs, classes) in enumerate(test_loader):
                    inputs = inputs.to(device)
                    classes = classes.to(device)
                    outputs = CNNmodel(inputs)
                    _, preds = torch.max(outputs, 1)
                    # Append batch prediction results
                    predlist = torch.cat([predlist, preds.view(-1).cpu()])
                    lbllist = torch.cat([lbllist, classes.view(-1).cpu()])
            #### create the folder where the figures will be stored
            path = "corridas/prueba" + str(cor - 1)
            try:
                os.mkdir(path)
            except OSError:
                print("Creation of the directory %s failed" % path)
            else:
                print("Successfully created the directory %s" % path)
            ### plot the confusion matrix
            arr = confusion_matrix(lbllist.numpy(), predlist.numpy())
            df_cm = pd.DataFrame(arr, class_names, class_names)
            plt.figure(figsize=(9, 6))
            sn.heatmap(df_cm, annot=True, fmt="d", cmap='BuGn')
            plt.xlabel("prediction")
            plt.ylabel("label (ground truth)")
            plt.savefig(path + '/matrix.png')  # save the figure to file
            plt.show()
            ### plot the losses
            plt.plot(train_losses, label='training loss')
            plt.plot(test_losses, label='validation loss')
            plt.title('Loss at the end of each epoch')
            plt.legend()
            plt.savefig(path + '/loss.png')  # save the figure to file
            plt.show()
            ### plot the accuracies
            plt.plot([t / 80 for t in train_correct], label='training accuracy')
            plt.plot([t / 30 for t in test_correct], label='validation accuracy')
            plt.title('Accuracy at the end of each epoch')
            plt.legend()
            plt.savefig(path + '/acc.png')  # save the figure to file
            plt.show()
            print('\x1b[1;30;43m' + ".......ALL VALUES SUMMARIZED....." + '\x1b[0m')
            from sklearn.metrics import f1_score, precision_score, recall_score
            print(f'Test accuracy: {test_correct[-1].item()*100/3000:.2f}%')
            print(f'PRC: {precision_score(lbllist.numpy(), predlist.numpy(), average="weighted")*100:.4f}%')
            print(f'recall: {recall_score(lbllist.numpy(), predlist.numpy(), average="weighted")*100:.4f}%')
            print(f'f1_score: {f1_score(lbllist.numpy(), predlist.numpy(), average="weighted")*100:.4f}%')
            print(f'\nDuration: {time.time() - start_time:.0f} seconds')  # print the time elapsed
            acc.append(round(test_correct[-1].item() * 100 / 3000, 2))
            prc.append(round(precision_score(lbllist.numpy(), predlist.numpy(), average="weighted") * 100, 2))
            recall.append(round(recall_score(lbllist.numpy(), predlist.numpy(), average="weighted") * 100, 2))
            f1.append(round(f1_score(lbllist.numpy(), predlist.numpy(), average="weighted") * 100, 2))
            times.append(int(time.time() - start_time))
            # write this run's metrics into the spreadsheet
            if count <= 100:
                excel_sheet = excel_file['acc']
                excel_sheet.cell(row=mm, column=count).value = round(test_correct[-1].item() * 100 / 3000, 2)
                excel_sheet = excel_file['prc']
                excel_sheet.cell(row=mm, column=count).value = round(precision_score(lbllist.numpy(), predlist.numpy(), average="weighted") * 100, 2)
                excel_sheet = excel_file['recall']
                excel_sheet.cell(row=mm, column=count).value = round(recall_score(lbllist.numpy(), predlist.numpy(), average="weighted") * 100, 2)
                excel_sheet = excel_file['f1']
                excel_sheet.cell(row=mm, column=count).value = round(f1_score(lbllist.numpy(), predlist.numpy(), average="weighted") * 100, 2)
                excel_sheet = excel_file['times']
                excel_sheet.cell(row=mm, column=count).value = int(time.time() - start_time)
                count = count + 1
            else:
                count = 2
                mm = mm + 1
            excel_file.save('corridas/corridas.xlsx')
Maybe you are missing the if-clause protection (the if __name__ == "__main__" guard) as described in the Windows FAQ.
Could you add it and rerun your code?
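I.e. the entry point that spawns the processes should look roughly like this (a minimal sketch; main() stands in for your actual launch code):

import torch
import test_model_win_auto_fila

def main():
    used_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    test_model_win_auto_fila.run(used_device)

if __name__ == "__main__":
    # on Windows, worker processes re-import this script; the guard keeps
    # them from re-running the training code recursively
    main()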
I put if __name__ == "__main__": but the execution never enters this if.