I am working on a binary classification task, for which I created an AlexNet network initialized with the weights of the pretrained AlexNet model. Instead of training the whole network, I have frozen all layers except the final one, for which I kept requires_grad as True. However, during training I observed no major improvement in the run time of the model: 20 epochs on 20,000 images still took almost 6 hours. I am not sure where I am going wrong. Please advise, with respect to the code below, whether I am going in the right direction or whether there is a mistake in how I freeze the layers or train the model.
alexnet_url = 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth'

# Per-channel statistics that the torchvision pretrained weights expect their
# inputs to be normalized with.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
IMAGE_SIZE = [256, 256]

# Number of background processes that decode and transform images. With the
# default of 0 all image loading happens in the training process itself.
# NOTE: on platforms that start workers with "spawn" (Windows, macOS) the
# script body must be guarded by `if __name__ == '__main__':`; set this to 0
# if that is not possible.
NUM_WORKERS = 4

transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
# Evaluation pipeline: no augmentation, but the same resize and normalization
# as training so that images can be stacked into fixed-size batches.
test_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Settings shared by all loaders; pinned host memory only helps when batches
# are copied to a CUDA device.
loader_kwargs = dict(
    batch_size=64,
    num_workers=NUM_WORKERS,
    pin_memory=torch.cuda.is_available(),
)

dataset = CatDogDataset(root_dir='./Cat-Dog-data/cat-dog-train', transform=transform)
# NOTE(review): the validation split shares `dataset`, so it also receives the
# random flip from `transform` -- confirm this is intended.
train_data, val_data = random_split(dataset, [19000, 1041])
train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, **loader_kwargs)
# Sample order does not matter for evaluation, so shuffling is skipped.
val_loader = torch.utils.data.DataLoader(val_data, shuffle=False, **loader_kwargs)
test_data = CatDogDataset(root_dir='./Cat-Dog-data/cat-dog-test', transform=test_transform)
test_loader = torch.utils.data.DataLoader(test_data, shuffle=False, **loader_kwargs)

now = datetime.now()
print('Before training:-', now)

# loading pre-trained alexnet model
# NOTE(review): `model_pre` is not used anywhere in the code shown here; if it
# is not needed elsewhere, this load can be dropped.
model_pre = models.alexnet(pretrained=True)
# AlexNet variant for binary classification
class AlexNet(nn.Module):
    """AlexNet with a sigmoid output head, optionally initialized from pretrained weights.

    The layer layout (and therefore every index into ``features`` and
    ``classifier`` as well as every state-dict key) matches the pretrained
    checkpoint, except that the last linear layer has ``num_classes`` outputs
    and is followed by a sigmoid.

    Args:
        num_classes: Number of outputs of the final linear layer.
        pretrained: When True (the default), download the checkpoint at
            ``alexnet_url`` and copy every tensor except the final linear
            layer's weight and bias into this model. When False, keep the
            default random initialization and perform no download.
    """

    # Checkpoint entries belonging to the original output layer; their shape
    # depends on the original number of classes, so they are never copied.
    _HEAD_KEYS = ('classifier.6.weight', 'classifier.6.bias')

    def __init__(self, num_classes=1, pretrained=True):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        # Pools to a fixed 6x6 map so the classifier input size does not
        # depend on the input image resolution.
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
            nn.Sigmoid(),
        )
        if pretrained:
            self._load_pretrained_backbone()

    def _load_pretrained_backbone(self):
        """Copy all checkpoint tensors except the output layer into this model."""
        checkpoint = model_zoo.load_url(alexnet_url)
        # Start from the model's own state so the freshly initialized output
        # layer is kept, then overwrite everything else from the checkpoint.
        merged = self.state_dict()
        merged.update(
            {key: value for key, value in checkpoint.items()
             if key not in self._HEAD_KEYS}
        )
        self.load_state_dict(merged)

    def forward(self, x):
        """Return sigmoid outputs of shape (batch, num_classes) for image batch ``x``."""
        x = self.features(x)
        x = self.avgpool(x)
        # Flatten (batch, 256, 6, 6) to (batch, 9216) for the linear layers.
        x = torch.flatten(x, 1)
        return self.classifier(x)
model = AlexNet(num_classes=1).to(dev)

# Freeze every parameter, then re-enable gradients only for the final linear
# layer (classifier[6]), the one layer that is trained for this task.
for param in model.parameters():
    param.requires_grad = False
for param in model.classifier[6].parameters():
    param.requires_grad = True

criterion = nn.BCELoss()
# Only the trainable parameters are handed to the optimizer.
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=0.0001,
    betas=(0.9, 0.999),
)

total_step = len(train_loader)
train_losses, val_losses, train_accs, val_accs = [], [], [], []

# NOTE: freezing layers removes their backward pass and weight updates, but
# every batch still has to be loaded and pushed forward through the whole
# network, so that part of the run time is unaffected by freezing.
for epoch in range(num_epochs):
    train_loss = 0
    val_loss = 0
    train_acc = 0
    val_acc = 0
    for images, labels in train_loader:
        images = images.to(dev)
        labels = labels.to(dev)
        # Cast on the device the labels already live on; going through
        # torch.FloatTensor would copy them to the CPU and back each batch.
        targets = labels.float()

        outputs = model(images)
        loss = criterion(outputs, targets)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Batch accuracy, computed on-device: an output >= 0.5 counts as
        # class 1, anything below as class 0.
        total = labels.size(0)
        flat_labels = labels.view(-1)
        predicted = (outputs.detach() >= 0.5).view(-1).to(flat_labels.dtype)
        correct = (predicted == flat_labels).sum().item()
        train_acc += correct / total
Time observed before training:- 2019-09-26 10:38:14.336391
Time observed after training:- 2019-09-26 17:02:13.680405
Although there are only 4097 learnable parameters, the model still takes almost 5-6 hours to train.