Recently I tried to reimplement, in PyTorch, a simple architecture that was originally coded in Caffe.
Here is the actual code for the architecture (the CIFAR10 model) — as you can see, it's pretty much VGG, only smaller; nothing fancy here!
I first tried to replicate the Caffe results, but failed. Then I trained both versions using plain SGD with momentum
and without any data augmentation/normalization, to check whether my PyTorch implementation was correct. I trained both architectures in Caffe and PyTorch, and every single time Caffe performed much better:
I could easily get 95% accuracy in Caffe, while in PyTorch I could hardly reach 94.70%!
Caffe takes a lot of memory — as much as 5 GB — while PyTorch takes as little as 2.3 GB! I also noticed PyTorch is way faster than Caffe, and overall PyTorch performs much better in terms of memory management and training speed. However, I cannot find any reason why I can't get the same results in PyTorch that I get in Caffe.
Everything seems simple and trivial, and I think I did everything right, yet the PyTorch result is not even close!
Here is my implementation in PyTorch:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
# Per-layer hyper-parameters: 'C' -> [kernel_h, kernel_w, stride, padding].
layer = {
    'C': [3, 3, 1, 1]
}

# Network configurations: each entry is either ['C', out_channels]
# (conv -> batchnorm -> relu -> dropout) or ['M'] (2x2 max-pool + dropout).
cfg = {
    'simpnet': [['C', 66], ['C', 128], ['C', 128], ['C', 128], ['C', 192], ['M'],
                ['C', 192], ['C', 192], ['C', 192], ['C', 192], ['C', 288], ['M'],
                ['C', 288], ['C', 355], ['C', 432]]
}


class simpnet(nn.Module):
    """A small VGG-like CNN for CIFAR-sized inputs.

    The feature extractor is built from `cfg[simpnet_name]`; the classifier
    is a single linear layer fed by global max pooling, so the input spatial
    size only needs to survive the two 2x2 max-pool stages.
    """

    def __init__(self, classes=10, simpnet_name='simpnet'):
        super(simpnet, self).__init__()
        self.features = self._make_layers(cfg[simpnet_name])
        # Global max pooling in forward() collapses the feature map to 1x1,
        # so the classifier input is just the final channel count (432).
        self.classifier = nn.Linear(432, classes)

    def load_my_state_dict(self, state_dict):
        """Copy matching entries from `state_dict` into this model.

        Entries whose name is unknown or whose shape does not match are
        skipped (the randomly initialized values are kept).
        """
        own_state = self.state_dict()
        for name, param in state_dict.items():
            # Strip the DataParallel prefix so checkpoints saved from a
            # wrapped model still match.
            name = name.replace('module.', '')
            if name not in own_state:
                continue
            # BUGFIX: `Parameter` was referenced without being imported
            # (NameError when this branch ran); use nn.Parameter.
            if isinstance(param, nn.Parameter):
                # backwards compatibility for serialized parameters
                param = param.data
            print("STATE_DICT: {}".format(name))
            try:
                own_state[name].copy_(param)
            except Exception:
                # Shape mismatch (or any other copy failure): keep the
                # freshly initialized parameters for this entry.
                print('While copying the parameter named {}, whose dimensions in the model are'
                      ' {} and whose dimensions in the checkpoint are {}, ... Using Initial Params'.format(
                          name, own_state[name].size(), param.size()))

    def forward(self, x):
        """Return class logits of shape (batch, classes)."""
        out = self.features(x)
        out = F.max_pool2d(out, kernel_size=out.size()[2:])  # Global Max Pooling
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out

    def _make_layers(self, cfg):
        """Build the sequential feature extractor described by `cfg`."""
        layers = []
        in_channels = 3
        for x in cfg:
            if x[0] == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2),
                           nn.Dropout2d(0.2)]
            else:
                kernel_size = [layer[x[0]][0], layer[x[0]][1]]
                stride = layer[x[0]][2]
                padding = layer[x[0]][3]
                layers += [nn.Conv2d(in_channels, x[1], kernel_size, padding=padding, stride=stride),
                           nn.BatchNorm2d(x[1]),
                           nn.ReLU(inplace=True),
                           nn.Dropout2d(0.2)]
                in_channels = x[1]
        return nn.Sequential(*layers)
And here is the training part:
from __future__ import division
import os, sys, pdb, shutil, time, random
import argparse
import torch
import torch.optim.lr_scheduler as lr_scheduler
import torch.backends.cudnn as cudnn
import torchvision.datasets as dset
import torchvision.transforms as transforms
from utils import AverageMeter, RecorderMeter, time_string, convert_secs2time
import models
# Expose every lowercase, callable constructor from the local `models`
# package as a valid --arch choice.
model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))

parser = argparse.ArgumentParser(description='Trains ResNeXt on CIFAR or ImageNet', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('data_path', type=str, help='Path to dataset')
parser.add_argument('--dataset', type=str, choices=['cifar10', 'cifar100', 'imagenet', 'svhn', 'stl10'], help='Choose between Cifar10/100 and ImageNet.')
parser.add_argument('--arch', metavar='ARCH', default='resnet18', choices=model_names, help='model architecture: ' + ' | '.join(model_names) + ' (default: resnext29_8_64)')
# Optimization options
parser.add_argument('--epochs', type=int, default=300, help='Number of epochs to train.')
parser.add_argument('--batch_size', type=int, default=100, help='Batch size.')
parser.add_argument('--learning_rate', type=float, default=0.1, help='The Learning Rate.')
parser.add_argument('--momentum', type=float, default=0.90, help='Momentum.')
parser.add_argument('--decay', type=float, default=0.005, help='Weight decay (L2 penalty).')
parser.add_argument('--schedule', type=int, nargs='+', default=[100, 190, 306, 390, 440, 540], help='Decrease learning rate at these epochs.')
parser.add_argument('--gammas', type=float, nargs='+', default=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1], help='LR is multiplied by gamma on schedule, number of gammas should be equal to schedule')
# Checkpoints
parser.add_argument('--print_freq', default=200, type=int, metavar='N', help='print frequency (default: 200)')
parser.add_argument('--save_path', type=str, default='./', help='Folder to save checkpoints and log.')
parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)')
parser.add_argument('--start_epoch', default=0, type=int, metavar='N', help='manual epoch number (useful on restarts)')
parser.add_argument('--evaluate', dest='evaluate', action='store_true', help='evaluate model on validation set')
# Acceleration
parser.add_argument('--ngpu', type=int, default=1, help='0 = CPU.')
parser.add_argument('--workers', type=int, default=2, help='number of data loading workers (default: 2)')
# random seed
parser.add_argument('--manualSeed', type=int, help='manual seed')
args = parser.parse_args()
args.use_cuda = args.ngpu > 0 and torch.cuda.is_available()

# Seed every RNG we use so a run can be reproduced from the logged seed.
if args.manualSeed is None:
    args.manualSeed = random.randint(1, 10000)
random.seed(args.manualSeed)
torch.manual_seed(args.manualSeed)
if args.use_cuda:
    # BUGFIX: torch.cuda.manual_seed was previously called outside this
    # guard, touching the CUDA RNG even on CPU-only runs; keep all CUDA
    # seeding behind the use_cuda check.
    torch.cuda.manual_seed(args.manualSeed)
    torch.cuda.manual_seed_all(args.manualSeed)

# Let cuDNN auto-tune conv algorithms for fixed input shapes (faster, at
# the cost of bit-for-bit run-to-run determinism).
cudnn.benchmark = True
def main():
    """Train/evaluate a model on the selected dataset, driven by the
    module-level `args`; logs, checkpoints, and a training-curve plot are
    written under args.save_path."""
    # Init logger
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    log = open(os.path.join(args.save_path, 'log_seed_{}.txt'.format(args.manualSeed)), 'w')
    print_log('save path : {}'.format(args.save_path), log)
    state = {k: v for k, v in args._get_kwargs()}
    print_log(state, log)
    print_log("Random Seed: {}".format(args.manualSeed), log)
    print_log("python version : {}".format(sys.version.replace('\n', ' ')), log)
    print_log("torch version : {}".format(torch.__version__), log)
    print_log("cudnn version : {}".format(torch.backends.cudnn.version()), log)
    # Init dataset
    if not os.path.isdir(args.data_path):
        os.makedirs(args.data_path)
    # NOTE(review): the only train-time augmentation is a horizontal flip and
    # there is no normalization — presumably to mirror the Caffe setup; confirm.
    train_transform = transforms.Compose( [ transforms.RandomHorizontalFlip(), transforms.ToTensor()])
    #[transforms.CenterCrop(32), transforms.ToTensor(),
    # transforms.Normalize(mean, std)])
    #)
    test_transform = transforms.Compose([transforms.ToTensor()])
    # Select the dataset; all supported ones are downloaded on demand.
    if args.dataset == 'cifar10':
        train_data = dset.CIFAR10(args.data_path, train=True, transform=train_transform, download=True)
        test_data = dset.CIFAR10(args.data_path, train=False, transform=test_transform, download=True)
        num_classes = 10
    elif args.dataset == 'cifar100':
        train_data = dset.CIFAR100(args.data_path, train=True, transform=train_transform, download=True)
        test_data = dset.CIFAR100(args.data_path, train=False, transform=test_transform, download=True)
        num_classes = 100
    elif args.dataset == 'svhn':
        train_data = dset.SVHN(args.data_path, split='train', transform=train_transform, download=True)
        test_data = dset.SVHN(args.data_path, split='test', transform=test_transform, download=True)
        num_classes = 10
    elif args.dataset == 'stl10':
        train_data = dset.STL10(args.data_path, split='train', transform=train_transform, download=True)
        test_data = dset.STL10(args.data_path, split='test', transform=test_transform, download=True)
        num_classes = 10
    elif args.dataset == 'imagenet':
        assert False, 'Do not finish imagenet code'
    else:
        assert False, 'Do not support dataset : {}'.format(args.dataset)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True,
                                               num_workers=args.workers, pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size, shuffle=False,
                                              num_workers=args.workers, pin_memory=True)
    print_log("=> creating model '{}'".format(args.arch), log)
    # Init model, criterion, and optimizer
    net = models.__dict__[args.arch](num_classes)
    #torch.save(net, 'net.pth')
    #init_net = torch.load('net.pth')
    #net.load_my_state_dict(init_net.state_dict())
    print_log("=> network :\n {}".format(net), log)
    net = torch.nn.DataParallel(net, device_ids=list(range(args.ngpu)))
    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    # NOTE(review): lr, momentum and weight_decay are hard-coded here and
    # silently ignore args.learning_rate / args.momentum / args.decay —
    # confirm this is intentional.
    optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.005, nesterov=False)
    # caffe iter: 50k,95k,153k,195k,220k
    # NOTE(review): these milestones duplicate args.schedule's defaults; a CLI
    # override of --schedule has no effect on this scheduler.
    milestones = [100, 190, 306, 390, 440, 540]
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones, gamma=0.1)
    if args.use_cuda:
        net.cuda()
        criterion.cuda()
    recorder = RecorderMeter(args.epochs)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print_log("=> loading checkpoint '{}'".format(args.resume), log)
            checkpoint = torch.load(args.resume)
            recorder = checkpoint['recorder']
            args.start_epoch = checkpoint['epoch']
            net.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print_log("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch']), log)
        else:
            print_log("=> no checkpoint found at '{}'".format(args.resume), log)
    else:
        print_log("=> do not use any checkpoint for {} model".format(args.arch), log)
    # Evaluation-only mode: run a single validation pass and exit.
    if args.evaluate:
        validate(test_loader, net, criterion, log)
        return
    # Main loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(args.start_epoch, args.epochs):
        #current_learning_rate = adjust_learning_rate(optimizer, epoch, args.gammas, args.schedule)
        # NOTE(review): the LR is read before scheduler.step(); verify the
        # value logged for the epoch right after a milestone is the one used.
        current_learning_rate = float(scheduler.get_lr()[-1])
        #print('lr:',current_learning_rate)
        scheduler.step()
        # Estimate remaining wall-clock time from the running epoch average.
        need_hour, need_mins, need_secs = convert_secs2time(epoch_time.avg * (args.epochs-epoch))
        need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format(need_hour, need_mins, need_secs)
        print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [learning_rate={:.6f}]'.format(time_string(), epoch, args.epochs, need_time, current_learning_rate) \
                + ' [Best : Accuracy={:.2f}, Error={:.2f}]'.format(recorder.max_accuracy(False), 100-recorder.max_accuracy(False)), log)
        # train for one epoch
        train_acc, train_los = train(train_loader, net, criterion, optimizer, epoch, log)
        # evaluate on validation set
        #val_acc, val_los = extract_features(test_loader, net, criterion, log)
        val_acc, val_los = validate(test_loader, net, criterion, log)
        is_best = recorder.update(epoch, train_los, train_acc, val_los, val_acc)
        # Keep an extra named snapshot at epoch 180 for later inspection.
        if epoch == 180:
            save_checkpoint({
                'epoch': epoch ,
                'arch': args.arch,
                'state_dict': net.state_dict(),
                'recorder': recorder,
                'optimizer' : optimizer.state_dict(),
            }, False, args.save_path, 'checkpoint_{0}.pth.tar'.format(epoch))
        # Always save the rolling checkpoint (copied to model_best on a new best).
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': net.state_dict(),
            'recorder': recorder,
            'optimizer' : optimizer.state_dict(),
        }, is_best, args.save_path, 'checkpoint.pth.tar')
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()
        # Refresh the accuracy/loss plot every epoch.
        recorder.plot_curve( os.path.join(args.save_path, 'training_plot_{0}.png'.format(args.manualSeed)) )
    log.close()
# train function (forward, backward, update)
def train(train_loader, model, criterion, optimizer, epoch, log):
    """Run one training epoch; return (top-1 accuracy avg, loss avg)."""
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    # switch to train mode
    model.train()
    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        if args.use_cuda:
            # BUGFIX: `async=True` is a SyntaxError on Python 3.7+ (`async`
            # became a keyword); `non_blocking=True` is the replacement.
            target = target.cuda(non_blocking=True)
            input = input.cuda()
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)
        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)
        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        # BUGFIX: indexing a 0-dim tensor (`loss.data[0]`, `prec1[0]`) fails
        # on PyTorch >= 0.5; `.item()` reads the Python scalar.
        losses.update(loss.item(), input.size(0))
        top1.update(prec1.item(), input.size(0))
        top5.update(prec5.item(), input.size(0))
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if i % args.print_freq == 0:
            print_log(' Epoch: [{:03d}][{:03d}/{:03d}] '
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                      'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                      'Loss {loss.val:.4f} ({loss.avg:.4f}) '
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f}) '
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f}) '.format(
                          epoch, i, len(train_loader), batch_time=batch_time,
                          data_time=data_time, loss=losses, top1=top1, top5=top5) + time_string(), log)
    print_log(' **Train** Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f} Error@1 {error1:.3f}'.format(top1=top1, top5=top5, error1=100-top1.avg), log)
    return top1.avg, losses.avg
def validate(val_loader, model, criterion, log):
    """Evaluate on the validation set; return (top-1 accuracy avg, loss avg)."""
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    # switch to evaluate mode
    model.eval()
    # BUGFIX: Variable(..., volatile=True) has no effect on PyTorch >= 0.4;
    # torch.no_grad() is the supported way to disable autograd for inference.
    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
            if args.use_cuda:
                # BUGFIX: `async=True` is a SyntaxError on Python 3.7+;
                # `non_blocking=True` is the replacement.
                target = target.cuda(non_blocking=True)
                input = input.cuda()
            # compute output
            output = model(input)
            loss = criterion(output, target)
            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            # BUGFIX: 0-dim tensor indexing fails on PyTorch >= 0.5; use .item().
            losses.update(loss.item(), input.size(0))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))
    print_log(' **Test** Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f} Error@1 {error1:.3f}'.format(top1=top1, top5=top5, error1=100-top1.avg), log)
    return top1.avg, losses.avg
def extract_features(val_loader, model, criterion, log):
    """Debug variant of validate() for a model that returns (logits, features);
    metrics are computed on the logits only.

    NOTE(review): this expects `model` to accept a single-element list and
    return two values — confirm against the model actually used; the regular
    models in this file take a tensor and return one value.
    """
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    # switch to evaluate mode
    model.eval()
    # BUGFIX: replaced deprecated volatile=True Variables with torch.no_grad().
    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
            if args.use_cuda:
                # BUGFIX: `async=True` is a SyntaxError on Python 3.7+;
                # `non_blocking=True` is the replacement.
                target = target.cuda(non_blocking=True)
                input = input.cuda()
            # compute output
            # BUGFIX: removed a leftover pdb.set_trace() breakpoint that
            # halted execution on every batch.
            output, features = model([input])
            loss = criterion(output, target)
            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            # BUGFIX: 0-dim tensor indexing fails on PyTorch >= 0.5; use .item().
            losses.update(loss.item(), input.size(0))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))
    print_log(' **Test** Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f} Error@1 {error1:.3f}'.format(top1=top1, top5=top5, error1=100-top1.avg), log)
    return top1.avg, losses.avg
def print_log(print_string, log):
    """Echo a message to stdout and mirror it (newline-terminated) into the
    open log file, flushing so the log stays current if the run dies."""
    message = "{}".format(print_string)
    print(message)
    log.write(message + '\n')
    log.flush()
def save_checkpoint(state, is_best, save_path, filename):
    """Serialize `state` to save_path/filename; when `is_best`, also copy it
    to the canonical model_best.pth.tar in the same folder."""
    target = os.path.join(save_path, filename)
    torch.save(state, target)
    if is_best:
        shutil.copyfile(target, os.path.join(save_path, 'model_best.pth.tar'))
def adjust_learning_rate(optimizer, epoch, gammas, schedule):
    """Decay the base learning rate (args.learning_rate) by each gamma whose
    schedule milestone has been reached, apply it to every param group, and
    return the resulting learning rate."""
    assert len(gammas) == len(schedule), "length of gammas and schedule should be equal"
    lr = args.learning_rate
    for gamma, step in zip(gammas, schedule):
        if epoch < step:
            break
        lr *= gamma
    for group in optimizer.param_groups:
        group['lr'] = lr
    return lr
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k.

    Args:
        output: (batch, classes) score/logit tensor.
        target: (batch,) tensor of true class indices.
        topk: tuple of k values to report.

    Returns:
        List of 0-dim tensors, each the percentage (0-100) of samples whose
        true class is within the top-k predictions, in `topk` order.
    """
    maxk = max(topk)
    batch_size = target.size(0)
    # Top-maxk predicted class indices per sample, transposed to (maxk, batch)
    # so row r holds the rank-r prediction for every sample.
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        # BUGFIX: `correct[:k]` is non-contiguous (pred was transposed), so
        # .view(-1) raises on recent PyTorch; .reshape(-1) handles both cases.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
# Script entry point: run the full training/evaluation pipeline only when
# executed directly (not when imported).
if __name__ == '__main__':
    main()