Hi,
I am trying to implement a ResNet and train it on a subset of ImageNet (100 classes, 500 images each).
To test my training code, I tried it with this ResNet architecture:
link
I have 2 questions regarding my training:
- I always get a BrokenPipeError or a DefaultCPUAllocator (not enough memory) error after 10 epochs. Why is that, and what can I do to avoid it? I assume the broken pipe is just the CPU error surfacing on a different thread. Is it simply not possible to train this on a CPU?
- Even during the first 10 epochs (before the error occurs) the network does not get better; the value stays at ~0.45 no matter which learning rate I use.
import torch
import torchvision
import torchvision.transforms as transforms
def classGetter(path="C:/Users/Felix/Desktop/python/pytorchTest/resNetTry/dl4cv_dataset/classes.txt"):
    """Return the class names stored in a text file, one name per line.

    Args:
        path: Location of the class list. Defaults to the dataset's
            ``classes.txt`` so existing zero-argument calls keep working.

    Returns:
        A list with one string per line of the file, with newline
        characters removed. Blank lines yield empty strings.
    """
    # The context manager closes the file even if reading raises.
    with open(path, "r") as file:
        return [line.replace("\n", "") for line in file]
def imshow(img):
    """Display an image tensor in a matplotlib window.

    Args:
        img: Image tensor exposing ``.numpy()``. The channel axis is moved
            from position 0 to the end before plotting, so a 3-D
            channels-first layout is expected.
    """
    # Imported here so the function also works when this module is imported;
    # the script otherwise binds `plt` and `np` only under the __main__ guard.
    import matplotlib.pyplot as plt
    import numpy as np

    # Undo Normalize(mean=0.5, std=0.5) from the dataset transform.
    img = img / 2 + 0.5
    npimg = img.numpy()
    # matplotlib wants the channel axis last.
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()
def _validation_loss(net, loader, criterion):
    """Return the loss of ``net`` on one batch drawn from ``loader`` as a float.

    The forward pass runs in eval mode under ``torch.no_grad()``, so no
    autograd graph is built for the validation batch, and the result is
    converted to a plain Python number before it is returned. The module's
    previous train/eval mode is restored afterwards.

    Args:
        net: The ``torch.nn.Module`` being trained.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Callable mapping ``(outputs, labels)`` to a scalar tensor.

    Returns:
        The scalar loss for a single batch.
    """
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            # NOTE(review): a fresh iterator is created per call, which
            # restarts the loader's worker processes when num_workers > 0.
            inputs, labels = next(iter(loader))
            return criterion(net(inputs), labels).item()
    finally:
        net.train(was_training)


if __name__ == '__main__':
    # Imports are kept under the guard, as in the original script. They are
    # bound as module globals, so functions defined above can use `plt`/`np`.
    import matplotlib.pyplot as plt
    import numpy as np
    import torch.nn as nn
    import torch.optim as optim

    import compareNonDeepNet as comNet
    import ResNet
    import ResNetExample as rsEX

    imageSize = 80  # dataset images are 100 px (per the original note)
    # Observations recorded by the author:
    #   larger sizes -> "DefaultCPUAllocator: not enough memory"
    #   80x80        -> "BrokenPipeError: [Errno 32] Broken pipe"
    # NOTE(review): 0.2 is unusually high for SGD with momentum 0.9 and a
    # batch size of 10; try something like 0.01 if the loss does not move.
    learnRate = 0.2
    nrOfEpochs = 5
    # Which network to train: "res" (own ResNet attempt), "comp" (comparison
    # network); any other value selects the example ResNet50.
    use = "example Resnet"

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize(imageSize),
        transforms.RandomCrop(imageSize),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    dataRoot = "C:/Users/Felix/Desktop/python/pytorchTest/resNetTry/dl4cv_dataset/"
    train = torchvision.datasets.ImageFolder(root=dataRoot + "train/",
                                             transform=transform)
    valid = torchvision.datasets.ImageFolder(root=dataRoot + "val/",
                                             transform=transform)

    # num_workers > 0 loads batches in separate worker processes.
    trainloader = torch.utils.data.DataLoader(train,
                                              batch_size=10,
                                              shuffle=True,
                                              num_workers=2)
    valLoader = torch.utils.data.DataLoader(valid,
                                            batch_size=100,
                                            shuffle=True,
                                            num_workers=2)

    # Decode the first sample once and report its shape.
    sample = train[0][0]
    channels, height, width = sample.shape
    print(f"{channels} channels")
    print(f"{height},{width} image size")

    classes = classGetter()

    # Fetch one training batch as a sanity check of the input pipeline.
    # DataLoader iterators are advanced with the builtin next().
    dataiter = iter(trainloader)
    images, labels = next(dataiter)
    # To preview the batch: imshow(torchvision.utils.make_grid(images))

    if use == "res":
        net = ResNet.Net()
    elif use == "comp":
        net = comNet.Net()
    else:
        net = rsEX.ResNet50(img_channel=3, num_classes=len(classes))

    print("--------------------------")
    print("--------------------------")

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=learnRate, momentum=0.9)

    # One entry per 200 mini-batches; both lists hold plain floats.
    lossArray = []
    valLossArray = []

    fig, ax = plt.subplots()
    ax.set(xlabel='epoch', ylabel='loss',
           title='Error on ' + use)
    ax.grid()

    for epoch in range(nrOfEpochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(trainloader):
            # forward + backward + optimize
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 200 == 199:  # report every 200 mini-batches
                # Stored as a float: keeping the loss tensor itself would
                # keep every validation batch's autograd graph alive, so
                # memory would grow with each check.
                valLoss = _validation_loss(net, valLoader, criterion)
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 200))
                valLossArray.append(valLoss)
                lossArray.append(running_loss / 200)
                ax.plot(lossArray, 'C1', label="train")
                ax.plot(valLossArray, 'C2', label="val")
                plt.draw()
                plt.pause(0.001)  # let the GUI event loop redraw
                running_loss = 0.0

    print('Finished Training')
    valLoss = _validation_loss(net, valLoader, criterion)
    print('%.3f after %d epochs' % (valLoss, nrOfEpochs))

    PATH = './resTry.pth'
    torch.save(net.state_dict(), PATH)

    ax.plot(lossArray, 'C1', label="train")
    ax.plot(valLossArray, 'C2', label="val")
    plt.show()