This is my model. I apologize for the long block of code; I have no idea how to convey my question otherwise. Can anybody tell me why the output for epochs 10-12 is so weird?
import os
import tarfile
import torch
from torch.utils.data import random_split
import torchvision.transforms as tt
from torchvision.transforms import Compose
from torchvision.datasets.utils import download_url
from torchvision.datasets import ImageFolder
import shutil
import glob
from random import sample
from torch.utils.data import DataLoader
# Download and unpack the CIFAR-10 archive, then carve a validation split out
# of the training images (500 randomly chosen PNGs per class).
download_url(url="https://s3.amazonaws.com/fast-ai-imageclas/cifar10.tgz", root=".")

data_training = "/content/data/cifar10/train"
data_validation = "/content/data/cifar10/validate"
class_names = (
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
)
val_images_per_class = 500

# The context manager closes the archive handle even if extraction fails.
with tarfile.open("/content/cifar10.tgz", mode="r") as file:
    # Extract only once: re-extracting on a rerun would put the images that
    # were already moved to the validation folder back into the training
    # folder as well.
    if not os.path.isdir(data_training):
        file.extractall("./data")

for class_name in class_names:
    class_val_dir = os.path.join(data_validation, class_name)
    os.makedirs(class_val_dir, exist_ok=True)

    # Only top the folder up to the target size, so running this cell twice
    # does not move a second batch of 500 images out of the training set.
    already_moved = len(glob.glob(os.path.join(class_val_dir, "*.png")))
    still_needed = val_images_per_class - already_moved
    if still_needed <= 0:
        continue

    candidates = glob.glob(os.path.join(data_training, class_name, "*.png"))
    for image_path in sample(candidates, min(still_needed, len(candidates))):
        shutil.move(image_path, class_val_dir)
# Training images are randomly cropped (with reflected padding) and randomly
# perspective-warped before being converted to tensors and normalised.
train_steps = [
    tt.RandomCrop(32, padding=6, padding_mode="reflect"),
    tt.RandomPerspective(distortion_scale=0.5, p=0.5),
    tt.ToTensor(),
    tt.Normalize(
        mean=(0.4914, 0.4822, 0.4465),
        std=(0.2023, 0.1994, 0.2010),
        inplace=True,
    ),
]
train_data_transformation = tt.Compose(train_steps)

# Validation images get no augmentation: tensor conversion and the same
# normalisation only.
val_steps = [
    tt.ToTensor(),
    tt.Normalize(
        mean=(0.4914, 0.4822, 0.4465),
        std=(0.2023, 0.1994, 0.2010),
        inplace=True,
    ),
]
val_data_transformation = tt.Compose(val_steps)

# One sub-folder per class under each root.
train_ds = ImageFolder(
    root="/content/data/cifar10/train",
    transform=train_data_transformation,
)
val_ds = ImageFolder(
    root="/content/data/cifar10/validate",
    transform=val_data_transformation,
)

# Both loaders share batch size and worker settings; only training shuffles.
loader_options = dict(batch_size=128, num_workers=4, pin_memory=True)
train_set = DataLoader(train_ds, shuffle=True, **loader_options)
val_set = DataLoader(val_ds, **loader_options)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
def accuracy(outputs, labels):
    """Return the fraction of predictions that match ``labels``.

    The prediction for each row of ``outputs`` is the index of its largest
    value along dimension 1. The result is a 0-dim tensor built from a
    Python float.
    """
    predicted = torch.max(outputs, dim=1)[1]
    n_correct = torch.sum(predicted == labels).item()
    return torch.tensor(n_correct / len(predicted))
def training_step(train_set, model):
    """Compute the cross-entropy loss of ``model`` on one batch.

    ``train_set`` is unpacked as an ``(images, labels)`` pair; both tensors
    are moved to the module-level ``device`` before the forward pass.
    Returns the loss tensor so the caller can back-propagate through it.
    """
    inputs, targets = train_set
    inputs = inputs.to(device)
    targets = targets.to(device)
    return F.cross_entropy(model(inputs), targets)
def validation_loss(val_inputs, model):
    """Evaluate ``model`` on one validation batch.

    Puts the model in eval mode, runs the forward pass with gradient
    tracking disabled, and returns a dict with the batch cross-entropy
    under ``"loss"`` and the batch accuracy under ``"Accuracy"``.
    """
    model.eval()
    inputs, targets = val_inputs
    with torch.no_grad():
        inputs = inputs.to(device)
        targets = targets.to(device)
        logits = model(inputs)
        batch_loss = F.cross_entropy(logits, targets)
        batch_accuracy = accuracy(logits, targets)
    return {"loss": batch_loss.detach(), "Accuracy": batch_accuracy}
def validation_combine_loss(outputs, model):
    """Average the per-batch validation metrics over an iterable of batches.

    Each batch from ``outputs`` is scored with ``validation_loss``; the
    per-batch losses and accuracies are then averaged (unweighted mean over
    batches) and returned as Python floats under ``"Loss"`` and
    ``"Accuracy"``.
    """
    batch_losses = []
    batch_accuracies = []
    for batch in outputs:
        metrics = validation_loss(batch, model)
        batch_losses.append(metrics["loss"])
        batch_accuracies.append(metrics["Accuracy"])

    mean_loss = torch.stack(batch_losses).mean()
    mean_accuracy = torch.stack(batch_accuracies).mean()
    return {"Loss": mean_loss.item(), "Accuracy": mean_accuracy.item()}
def epoch_end(result, epoch):
    """Print a one-line summary of an epoch.

    Reads ``"lrs"`` (the last entry is reported), ``"Loss"``, ``"Accuracy"``
    and ``"train loss"`` from ``result``.
    """
    template = "epoch: {}, last_lr {}, Epoch_loss:{}, Epoch_accuracy {}, train_loss {}"
    last_lr = result["lrs"][-1]
    val_loss = result["Loss"]
    val_accuracy = result["Accuracy"]
    train_loss = result["train loss"]
    print(template.format(epoch, last_lr, val_loss, val_accuracy, train_loss))
def conv_block(in_channels, out_channels, pool=False):
    """Build a 3x3 convolution -> batch norm -> ReLU stack.

    The convolution uses ``padding=1``. When ``pool`` is true, a
    ``MaxPool2d(2)`` layer is appended after the ReLU. Returns the layers
    wrapped in an ``nn.Sequential``.
    """
    stages = [
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    ]
    if pool:
        stages += [nn.MaxPool2d(2)]
    return nn.Sequential(*stages)
class ResNet9(nn.Module):
    """Small residual CNN built from ``conv_block`` stages.

    Four convolution stages (three of them pooled) with two residual
    branches, followed by a classifier head of max-pool, flatten, dropout
    and a linear layer from 512 features to ``num_classes``.
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        # Stem and first pooled stage.
        self.conv1 = conv_block(in_channels, 64)
        self.conv2 = conv_block(64, 128, pool=True)
        # First residual branch (channel count unchanged so it can be added back).
        self.res1 = nn.Sequential(
            conv_block(128, 128),
            conv_block(128, 128),
        )
        # Two more pooled stages widening to 512 channels.
        self.conv3 = conv_block(128, 256, pool=True)
        self.conv4 = conv_block(256, 512, pool=True)
        # Second residual branch.
        self.res2 = nn.Sequential(
            conv_block(512, 512),
            conv_block(512, 512),
        )
        head_layers = [
            nn.MaxPool2d(4),
            nn.Flatten(),
            nn.Dropout(0.2),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, xb):
        """Run the batch ``xb`` through the network and return the classifier output."""
        features = self.conv2(self.conv1(xb))
        branch = self.res1(features)
        features = branch + features
        features = self.conv4(self.conv3(features))
        branch = self.res2(features)
        features = branch + features
        return self.classifier(features)
# Build the 3-channel, 10-class network and place it on the selected device.
model = ResNet9(in_channels=3, num_classes=10).to(device)
def fit(epochs, train_set, val_dl, model, lr, weight_decay=0.001, grad_clip=0.1):  # model trainer
    """Train ``model`` with Adam and a one-cycle learning-rate schedule.

    Args:
        epochs: Number of passes over ``train_set``.
        train_set: Iterable of training batches; ``len(train_set)`` is used
            as the number of scheduler steps per epoch.
        val_dl: Iterable of validation batches, scored after every epoch.
        model: The module to optimise (updated in place).
        lr: Passed to Adam as the learning rate and to ``OneCycleLR`` as
            the maximum learning rate of the cycle.
        weight_decay: Weight decay passed to Adam. The default keeps the
            value that used to be hard-coded.
        grad_clip: Maximum gradient norm applied before each optimiser
            step; pass ``None`` to disable clipping. The default keeps the
            value that used to be hard-coded.

    Returns:
        A list with one dict per epoch holding ``"Loss"`` and ``"Accuracy"``
        (validation), ``"lrs"`` (learning rate used at every step of the
        epoch) and ``"train loss"`` (mean training loss of the epoch).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr, weight_decay=weight_decay)  # defining the optimizer
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, lr, epochs=epochs, steps_per_epoch=len(train_set)
    )  # learning rate scheduler

    def get_lr():
        # Report the learning rate of the first parameter group.
        return optimizer.param_groups[0]["lr"]

    history = []
    for epoch in range(epochs):
        # Validation switches the model to eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        train_loss = []
        lrs = []
        for batch in train_set:
            loss = training_step(batch, model)
            # Store a detached copy: keeping the graph-attached tensor for
            # the whole epoch accumulates autograd history and wastes memory.
            train_loss.append(loss.detach())
            loss.backward()
            if grad_clip is not None:
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            optimizer.zero_grad()
            # Record the rate that was just used, then advance the schedule
            # once per batch (the schedule was sized with steps_per_epoch).
            lrs.append(get_lr())
            scheduler.step()

        # validation
        results = validation_combine_loss(val_dl, model)
        results["lrs"] = lrs
        results["train loss"] = torch.stack(train_loss).mean().item()
        epoch_end(results, epoch)
        history.append(results)
    return history
# Train for 20 epochs with a peak learning rate of 0.01.
fit(epochs=20, train_set=train_set, val_dl=val_set, model=model, lr=0.01)  #training model
My output is as follows; please take a look at epochs 10-12 and 20 as well. I have removed the output for epochs 0-9 and 13-19 to keep the post short.
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
self._shutdown_workers()
Traceback (most recent call last):
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
if w.is_alive():
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
self._shutdown_workers()
self._shutdown_workers()
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
if w.is_alive():
if w.is_alive():
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
Traceback (most recent call last):
assert self._parent_pid == os.getpid(), 'can only test a child process'
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
AssertionError: can only test a child process
self._shutdown_workers()
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
if w.is_alive():
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
epoch: 10, last_lr 0.007169430017913008, Epoch_loss:1.6145600080490112, Epoch_accuracy 0.41132813692092896, train_loss 1.6661852598190308
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
self._shutdown_workers()
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
if w.is_alive():
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
epoch: 11, last_lr 0.0061126202193628925, Epoch_loss:1.6688216924667358, Epoch_accuracy 0.36210936307907104, train_loss 1.64984130859375
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
self._shutdown_workers()
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
if w.is_alive():
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
assert self._parent_pid == os.getpid(), 'can only test a child process'
self._shutdown_workers()
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
AssertionError: can only test a child process
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
if w.is_alive():
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
assert self._parent_pid == os.getpid(), 'can only test a child process'
self._shutdown_workers()
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
AssertionError: can only test a child process
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
if w.is_alive():
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
self._shutdown_workers()
assert self._parent_pid == os.getpid(), 'can only test a child process'
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
AssertionError: can only test a child process
if w.is_alive():
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
epoch: 12, last_lr 0.00500002, Epoch_loss:1.628069519996643, Epoch_accuracy 0.39375001192092896, train_loss 1.6207380294799805
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
self._shutdown_workers()
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
if w.is_alive():
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
self._shutdown_workers()
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
if w.is_alive():
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
assert self._parent_pid == os.getpid(), 'can only test a child process'
Traceback (most recent call last):
AssertionError: can only test a child process
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
self._shutdown_workers()
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
if w.is_alive():
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe28fbe4f80>
assert self._parent_pid == os.getpid(), 'can only test a child process'
Traceback (most recent call last):
AssertionError: can only test a child process
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
self._shutdown_workers()
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
if w.is_alive():
File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
[{'Accuracy': 0.3980468809604645,
'Loss': 1.6618016958236694,
'lrs': [0.0003999999999999993,