Broken pipes and no learning

Hi,
I am trying to implement a ResNet and train it on a subset of ImageNet (100 classes, 500 images each).
To test my training loop I tried it on this ResNet architecture:
link
I have 2 questions regarding my training:

  1. I always get a broken pipe or DefaultCPUAllocator error after 10 epochs. Why is that, and what can I do to avoid it? I assume the broken pipe is just the same CPU allocator error raised in a different worker process. Is it simply not possible to train this on the CPU?
  2. Even in the first 10 epochs (before the error hits) it does not get better and stays at ~0.45, no matter the learning rate.
import torch
import torchvision
import torchvision.transforms as transforms


def classGetter():
    # read the class names, one per line
    path = "C:/Users/Felix/Desktop/python/pytorchTest/resNetTry/dl4cv_dataset/classes.txt"
    with open(path, "r") as file:
        return [line.replace("\n", "") for line in file]


def imshow(img):
    img = img / 2 + 0.5  # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()


if __name__ == '__main__':
    imageSize = 80  # dataset images are 100x100 px
    # too large -> DefaultCPUAllocator: not enough memory: you tried to allocate 20070400 bytes. Buy new RAM!
    # 80x80 -> BrokenPipeError: [Errno 32] Broken pipe, probably the CPU allocator error from another worker process
    learnRate = 0.2
    nrOfEpochs = 5
    use = "example Resnet"
    # use = "comp"  # which net is to be used? the resnet attempt or the compare network?
    # use = "res"

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Resize(imageSize),
         transforms.RandomCrop(imageSize),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    train = torchvision.datasets.ImageFolder(
        root="C:/Users/Felix/Desktop/python/pytorchTest/resNetTry/dl4cv_dataset/train/",
        transform=transform)
    valid = torchvision.datasets.ImageFolder(
        root="C:/Users/Felix/Desktop/python/pytorchTest/resNetTry/dl4cv_dataset/val/",
        transform=transform)
    # num_workers uses spawn()/fork() to make dataloader new process
    trainloader = torch.utils.data.DataLoader(train,
                                              batch_size=10,
                                              shuffle=True,
                                              num_workers=2)
    valLoader = torch.utils.data.DataLoader(valid,
                                            batch_size=100,
                                            shuffle=True,
                                            num_workers=2)

    # print(train[0])
    print(f"{train[0][0].shape[0]} channels")
    print(f"{train[0][0].shape[1]},{train[0][0].shape[2]} image size")

    classes = classGetter()
    # print(classes)

    import matplotlib.pyplot as plt
    import numpy as np

    # get some random training images
    dataiter = iter(trainloader)
    images, labels = next(dataiter)  # the iterator's .next() method was removed in newer PyTorch; use the builtin next()

    # title_obj = plt.title('please check if correct labeling')
    # plt.setp(title_obj, color='r')
    # print labels
    # print(' '.join('%5s' % classes[labels[j]] for j in range(10)))
    # show images
    # imshow(torchvision.utils.make_grid(images))

    import compareNonDeepNet as comNet
    import ResNet
    import ResNetExample as rsEX
    import torch.nn as nn

    if use == "res":
        net = ResNet.Net()
    else:
        if use == "comp":
            net = comNet.Net()
        else:
            # print(len(classes))
            net = rsEX.ResNet50(img_channel=3, num_classes=len(classes))

    # test if NN forward works
    print("--------------------------")
    # random_data = torch.rand((1, len(train[0][0]), len(train[0][0][0]), len(train[0][0][0][0])))
    # result = net(random_data)
    # print(result)
    print("--------------------------")

    import torch.optim as optim

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=learnRate, momentum=0.9)

    lossArray = []
    valLossArray = []
    fig, ax = plt.subplots()
    ax.set(xlabel='epoch', ylabel='loss',
           title='Error on ' + use)
    ax.grid()

    for epoch in range(nrOfEpochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            # print(labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 200 == 199:  # print every 200 mini-batches
                valInp, valLabel = next(iter(valLoader))
                out = net(valInp)  # [:50]
                valLoss = criterion(out, valLabel)  # [:50]

                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 200))

                valLossArray.append(valLoss.item())  # .item() stores a plain float instead of a tensor that keeps the graph alive
                lossArray.append(running_loss / 200)
                ax.plot(lossArray, 'C1', label="train")
                ax.plot(valLossArray, 'C2', label="val")

                # plt.legend() # shows labels and color
                # plt.show(block=False)
                plt.draw()
                plt.pause(0.001)
                # plt.show()

                running_loss = 0.0

    print('Finished Training')

    valInp, valLabel = next(iter(valLoader))
    out = net(valInp)
    valLoss = criterion(out, valLabel)
    print(str(valLoss) + " after " + str(nrOfEpochs) + " epochs")

    PATH = './resTry.pth'
    torch.save(net.state_dict(), PATH)

    ax.plot(lossArray, 'C1', label="train")
    ax.plot(valLossArray, 'C2', label="val")
    plt.show()


For the first issue: set num_workers=0 and check if you get a better error message.
The broken pipe error is raised, e.g., when a worker process encounters an error and crashes. The complete error message might give you an idea of why the process failed.
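
For illustration, a minimal debugging sketch reusing the loaders from the snippet above (everything else unchanged):

# run data loading in the main process so the real traceback is shown
trainloader = torch.utils.data.DataLoader(train, batch_size=10, shuffle=True, num_workers=0)
valLoader = torch.utils.data.DataLoader(valid, batch_size=100, shuffle=True, num_workers=0)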

Yes, it was a memory error in the other process. The error came from the validation forward pass:
out = net(valInp) # [:50]

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:73] data. DefaultCPUAllocator: not enough memory: you tried to allocate 40960000 bytes. Buy new RAM!

With lower batch sizes it didn't crash, but I couldn't figure out what causes problem 2.
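
Since the crash is an out-of-memory in the validation forward pass, one way to reduce its memory footprint (a sketch using the names from the script above, not tested against this setup) is to run that pass under torch.no_grad(), which skips storing the intermediate activations needed for backprop, and to keep only the scalar loss value.

# validation step with gradients disabled
net.eval()                             # switch dropout/batchnorm to eval mode for the check
with torch.no_grad():                  # no activations kept for backprop -> much less memory
    valInp, valLabel = next(iter(valLoader))
    out = net(valInp)
    valLoss = criterion(out, valLabel)
valLossArray.append(valLoss.item())    # keep the plain float for plotting
net.train()                            # back to training mode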