i am trying to train resnet50 from scratch for 100 epochs. using sgd(lr=0.1,weightdecay=0.0001,momentum=0.9) as optimizer and scheduler to decay lr by factor of 10 after every 30 epochs. batch size is 128.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms as transforms
import torch.utils.data
transform = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
train_fmnist_dataset = torchvision.datasets.FashionMNIST(root=β./β, train=True, transform=transform,download=True)
test_fmnist_dataset = torchvision.datasets.FashionMNIST(root=β./β, train=False, transform=transform,download=True)
train_data_loader = torch.utils.data.DataLoader(train_fmnist_dataset, batch_size=128, shuffle=True,num_workers=2,pin_memory=True)
test_data_loader = torch.utils.data.DataLoader(test_fmnist_dataset, batch_size=128, shuffle=False,num_workers=2,pin_memory=True)
from tqdm import tqdm
def train_epoch(model, criterion, optimizer, scheduler, train_loader, epoch, device):
loss_train = 0.
acc_train = 0.
ntrain = len(train_loader.dataset)
with tqdm(train_loader, unit=βbatchesβ) as pbar:
for i, (images, labels) in enumerate(pbar):
pbar.set_description(f"Epoch {epoch}")
images = images.to(device)
labels = labels.to(device).to(torch.long).view(-1)
# train and back-propagate
optimizer.zero_grad()
output = model(images)
loss = criterion(output, labels)
loss.backward()
optimizer.step()
scheduler.step()
# train accuracy
with torch.no_grad():
probs = F.softmax(output, dim=1).cpu()
preds = torch.max(probs, dim=1)[1]
acc_train += (preds == labels.cpu()).sum()
loss_train += loss.item()
if i < len(train_loader) - 1:
denom = images.shape[0] * (i+1)
else:
denom = ntrain
pbar.set_postfix(train_loss=loss_train/denom, train_acc=acc_train/denom)
def validate(model, criterion, val_loader, device):
acc_val = 0.
loss_val = 0.
nval = len(val_loader.dataset)
with torch.no_grad():
for val_images, val_labels in val_loader:
val_images = val_images.to(device)
val_labels = val_labels.to(device).to(torch.long).view(-1)
val_output = model(val_images)
val_loss = criterion(val_output, val_labels)
loss_val += val_loss.item()
probs = F.softmax(val_output, dim=1).cpu()
preds = torch.max(probs, dim=1)[1]
acc_val += (preds == val_labels.cpu()).sum()
return loss_val/nval, acc_val/nval
def train(model, criterion, optimizer, scheduler, train_loader, device):
for epoch in range(100):
train_epoch(model, criterion, optimizer, scheduler, train_loader, epoch, device)
from torchvision.models import resnet50
m1=resnet50()
m1.conv1=nn.Conv2d(1,64,kernel_size=7,stride=2,padding=3,bias=False)
for module in m1.modules():
if isinstance(module, nn.BatchNorm2d):
module.momentum=0.9
m1.fc=nn.Linear(in_features=2048,out_features=10)
m1.layer2[0].conv1.stride,m1.layer2[0].conv2.stride=2,1
m1.layer3[0].conv1.stride,m1.layer3[0].conv2.stride=2,1
m1.layer4[0].conv1.stride,m1.layer4[0].conv2.stride=2,1
import torch.optim as optim
from torch.optim import lr_scheduler
criterion = nn.CrossEntropyLoss(reduction=βsumβ)
optimizer = optim.SGD(params=m1.parameters(), lr=0.1,momentum=0.9,weight_decay=0.0001)
scheduler=lr_scheduler.StepLR(optimizer,step_size=30,gamma=0.1)
device = βcudaβ if torch.cuda.is_available() else βcpuβ
m1=m1.to(device)
criterion=criterion.to(device)
m1.train()
train(m1, criterion, optimizer, scheduler, train_data_loader, device)
torch.save(m1.state_dict(), β./m1β)
m1.eval()
e,a1=validate(m1,criterion,test_data_loader,device)
print(a1)
Epoch 0: 100%|ββββββββββ| 469/469 [04:13<00:00, 1.85batches/s, train_acc=tensor(0.1012), train_loss=31.4]
Epoch 1: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1010), train_loss=2.44]
Epoch 2: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1015), train_loss=2.45]
Epoch 3: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1015), train_loss=2.47]
Epoch 4: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1019), train_loss=2.46]
Epoch 5: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1015), train_loss=2.46]
Epoch 6: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1012), train_loss=2.49]
Epoch 7: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1014), train_loss=2.48]
Epoch 8: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1018), train_loss=2.45]
Epoch 9: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1017), train_loss=2.44]
Epoch 10: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1014), train_loss=2.48]
Epoch 11: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1014), train_loss=2.45]
Epoch 12: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1015), train_loss=2.47]
Epoch 13: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1015), train_loss=2.46]
Epoch 14: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1016), train_loss=2.44]
Epoch 15: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1013), train_loss=2.44]
Epoch 16: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1013), train_loss=2.44]
Epoch 17: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1014), train_loss=2.45]
Epoch 18: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1017), train_loss=2.44]
Epoch 19: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1016), train_loss=2.45]
Epoch 20: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1018), train_loss=2.44]
Epoch 21: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1017), train_loss=2.45]
Epoch 22: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1014), train_loss=2.45]
Epoch 23: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1010), train_loss=2.45]
Epoch 24: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1016), train_loss=2.44]
Epoch 25: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1017), train_loss=2.44]
Epoch 26: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1014), train_loss=2.45]
Epoch 27: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1013), train_loss=2.46]
Epoch 28: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1015), train_loss=2.46]
Epoch 29: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1015), train_loss=2.43]
Epoch 30: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1016), train_loss=2.45]
Epoch 31: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1016), train_loss=2.44]
Epoch 32: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1015), train_loss=2.45]
Epoch 33: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1012), train_loss=2.46]
Epoch 34: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1012), train_loss=2.44]
Epoch 35: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1017), train_loss=2.45]
Epoch 36: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1016), train_loss=2.48]
Epoch 37: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1013), train_loss=2.45]
Epoch 38: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1016), train_loss=2.45]
Epoch 39: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1017), train_loss=2.45]
Epoch 40: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1015), train_loss=2.47]
Epoch 41: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1017), train_loss=2.45]
Epoch 42: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1017), train_loss=2.46]
Epoch 43: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1015), train_loss=2.45]
Epoch 44: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1018), train_loss=2.44]
Epoch 45: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1018), train_loss=2.46]
Epoch 46: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1017), train_loss=2.46]
Epoch 47: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1013), train_loss=2.45]
Epoch 48: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1018), train_loss=2.46]
Epoch 49: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1016), train_loss=2.45]
Epoch 50: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1012), train_loss=2.44]
Epoch 51: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1014), train_loss=2.45]
Epoch 52: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1014), train_loss=2.42]
Epoch 53: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1018), train_loss=2.47]
Epoch 54: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1015), train_loss=2.45]
Epoch 55: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1014), train_loss=2.45]
Epoch 56: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1015), train_loss=2.47]
Epoch 57: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1016), train_loss=2.47]
Epoch 58: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1016), train_loss=2.43]
Epoch 59: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1018), train_loss=2.44]
Epoch 60: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1015), train_loss=2.45]
Epoch 61: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1015), train_loss=2.46]
Epoch 62: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1014), train_loss=2.44]
Epoch 63: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1015), train_loss=2.44]
Epoch 64: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1014), train_loss=2.44]
Epoch 65: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1012), train_loss=2.46]
Epoch 66: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1014), train_loss=2.43]
Epoch 67: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1014), train_loss=2.46]
Epoch 68: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1018), train_loss=2.45]
Epoch 69: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1012), train_loss=2.45]
Epoch 70: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1011), train_loss=2.45]
Epoch 71: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1015), train_loss=2.47]
Epoch 72: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1014), train_loss=2.45]
Epoch 73: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1013), train_loss=2.44]
Epoch 74: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.90batches/s, train_acc=tensor(0.1013), train_loss=2.45]
Epoch 75: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1016), train_loss=2.45]
Epoch 76: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1015), train_loss=2.44]
Epoch 77: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1011), train_loss=2.46]
Epoch 78: 100%|ββββββββββ| 469/469 [04:07<00:00, 1.89batches/s, train_acc=tensor(0.1011), train_loss=2.44]
Epoch 79: 31%|βββ | 145/469 [01:17<02:52, 1.87batches/s, train_acc=tensor(0.1029), train_loss=2.41]
but model not getting trained even after 79 epochs.
i am running this on kaggleβs p100 gpu.