Broken pipes and no learning

Hi,
I am trying to implement a ResNet and train it on a subset of ImageNet (100 classes, 500 images each).
To test my training loop I tried it on this ResNet architecture:
link
I have 2 questions regarding my training:

  1. I always get a broken pipe or DefaultCPUAllocator error after 10 epochs. Why is that, and what can I do to avoid it? I assume the broken pipe is just the same CPU allocator error raised in a different worker process. Is it simply not possible to train this on the CPU?
  2. Even in the first 10 epochs (before the error hits) it does not get better and stays at ~0.45, no matter the learning rate.
import torch
import torchvision
import torchvision.transforms as transforms


def classGetter():
    # read the class names, one per line
    path = "C:/Users/Felix/Desktop/python/pytorchTest/resNetTry/dl4cv_dataset/classes.txt"
    with open(path, "r") as file:
        return [line.replace("\n", "") for line in file]


def imshow(img):
    img = img / 2 + 0.5  # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()


if __name__ == '__main__':
    imageSize = 80  # dataset images are 100x100 px
    # too large -> DefaultCPUAllocator: not enough memory: you tried to allocate 20070400 bytes. Buy new RAM!
    # 80x80 -> BrokenPipeError: [Errno 32] Broken pipe, probably the CPU allocator error from another worker process
    learnRate = 0.2
    nrOfEpochs = 5
    use = "example Resnet"
    # use = "comp"  # which net is to be used? the resnet attempt or the compare network?
    # use = "res"

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Resize(imageSize),
         transforms.RandomCrop(imageSize),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    train = torchvision.datasets.ImageFolder(
        root="C:/Users/Felix/Desktop/python/pytorchTest/resNetTry/dl4cv_dataset/train/",
        transform=transform)
    valid = torchvision.datasets.ImageFolder(
        root="C:/Users/Felix/Desktop/python/pytorchTest/resNetTry/dl4cv_dataset/val/",
        transform=transform)
    # num_workers uses spawn()/fork() to make dataloader new process
    trainloader = torch.utils.data.DataLoader(train,
                                              batch_size=10,
                                              shuffle=True,
                                              num_workers=2)
    valLoader = torch.utils.data.DataLoader(valid,
                                            batch_size=100,
                                            shuffle=True,
                                            num_workers=2)

    # print(train[0])
    print(f"{train[0][0].shape[0]} channels")
    print(f"{train[0][0].shape[1]},{train[0][0].shape[2]} image size")

    classes = classGetter()
    # print(classes)

    import matplotlib.pyplot as plt
    import numpy as np

    # get some random training images
    dataiter = iter(trainloader)
    images, labels = next(dataiter)  # the iterator's .next() method was removed in newer PyTorch; use the builtin next()

    # title_obj = plt.title('please check if correct labeling')
    # plt.setp(title_obj, color='r')
    # print labels
    # print(' '.join('%5s' % classes[labels[j]] for j in range(10)))
    # show images
    # imshow(torchvision.utils.make_grid(images))

    import compareNonDeepNet as comNet
    import ResNet
    import ResNetExample as rsEX
    import torch.nn as nn

    if use == "res":
        net = ResNet.Net()
    else:
        if use == "comp":
            net = comNet.Net()
        else:
            # print(len(classes))
            net = rsEX.ResNet50(img_channel=3, num_classes=len(classes))

    # test if NN forward works
    print("--------------------------")
    # random_data = torch.rand((1, len(train[0][0]), len(train[0][0][0]), len(train[0][0][0][0])))
    # result = net(random_data)
    # print(result)
    print("--------------------------")

    import torch.optim as optim

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=learnRate, momentum=0.9)

    lossArray = []
    valLossArray = []
    fig, ax = plt.subplots()
    ax.set(xlabel='epoch', ylabel='loss',
           title='Error on ' + use)
    ax.grid()

    for epoch in range(nrOfEpochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            # print(labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 200 == 199:  # print every 200 mini-batches
                valInp, valLabel = next(iter(valLoader))
                out = net(valInp)  # [:50]
                valLoss = criterion(out, valLabel)  # [:50]

                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 200))

                valLossArray.append(valLoss.item())  # .item() stores a plain float instead of a tensor that keeps the graph alive
                lossArray.append(running_loss / 200)
                ax.plot(lossArray, 'C1', label="train")
                ax.plot(valLossArray, 'C2', label="val")

                # plt.legend() # shows labels and color
                # plt.show(block=False)
                plt.draw()
                plt.pause(0.001)
                # plt.show()

                running_loss = 0.0

    print('Finished Training')

    valInp, valLabel = next(iter(valLoader))
    out = net(valInp)
    valLoss = criterion(out, valLabel)
    print(str(valLoss) + " after " + str(nrOfEpochs) + " epochs")

    PATH = './resTry.pth'
    torch.save(net.state_dict(), PATH)

    ax.plot(lossArray, 'C1', label="train")
    ax.plot(valLossArray, 'C2', label="val")
    plt.show()


For the first issue: set num_workers=0 and check if you get a better error message.
The broken pipe error is raised, e.g., when a worker process encounters an error and crashes. The complete error message might give you an idea of why the process failed.
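
For illustration, a minimal debugging sketch reusing the loaders from the snippet above (everything else unchanged):

# run data loading in the main process so the real traceback is shown
trainloader = torch.utils.data.DataLoader(train, batch_size=10, shuffle=True, num_workers=0)
valLoader = torch.utils.data.DataLoader(valid, batch_size=100, shuffle=True, num_workers=0)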

Yes, it was a memory error in the other process. The error came from the validation forward pass:
out = net(valInp) # [:50]

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:73] data. DefaultCPUAllocator: not enough memory: you tried to allocate 40960000 bytes. Buy new RAM!

With lower batch sizes it didn't crash, but I couldn't figure out what causes problem 2.
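
Since the crash is an out-of-memory in the validation forward pass, one way to reduce its memory footprint (a sketch using the names from the script above, not tested against this setup) is to run that pass under torch.no_grad(), which skips storing the intermediate activations needed for backprop, and to keep only the scalar loss value.

# validation step with gradients disabled
net.eval()                             # switch dropout/batchnorm to eval mode for the check
with torch.no_grad():                  # no activations kept for backprop -> much less memory
    valInp, valLabel = next(iter(valLoader))
    out = net(valInp)
    valLoss = criterion(out, valLabel)
valLossArray.append(valLoss.item())    # keep the plain float for plotting
net.train()                            # back to training mode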