Run time for pretrained network

I am doing a binary classification task, for which I created an AlexNet network and am using the weights of the pretrained AlexNet model. Instead of training the whole network, I have frozen all the layers except the final one, for which I kept requires_grad as True. During training, however, I observed no major improvement in run time: 20 epochs still took almost 6 hours to train on 20,000 images. I am not sure where I am going wrong. Please suggest, with respect to the code below, whether I am going in the right direction or whether there is some mistake in freezing the layers or in the training.

import numpy as np
import torch
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
from datetime import datetime
from torch.utils.data import random_split
from torchvision import models, transforms

# CatDogDataset, dev and num_epochs are defined elsewhere in my script

alexnet_url = 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth'

transform = transforms.Compose([transforms.RandomHorizontalFlip(),
                                transforms.Resize([256,256]),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.5],std=[0.5])
                            ])

test_transform = transforms.Compose([
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.5],std=[0.5])
                            ])

dataset = CatDogDataset(root_dir = './Cat-Dog-data/cat-dog-train', transform = transform)
train_data, val_data = random_split(dataset, [19000, 1041])

train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)

val_loader = torch.utils.data.DataLoader(val_data, batch_size=64, shuffle=True)
test_data = CatDogDataset(root_dir = './Cat-Dog-data/cat-dog-test', transform = test_transform)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=64, shuffle=True)


now = datetime.now()
print('Before training:-', now)

# loading pre-trained alexnet model
model_pre = models.alexnet(pretrained=True)

# creating a new AlexNet model for binary classification
class AlexNet(nn.Module):
    def __init__(self, num_classes=1):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
            nn.Sigmoid()
        )

        state = model_zoo.load_url(alexnet_url)
        state_dict = self.state_dict()
        for k, v in state.items():
            if (k == 'classifier.6.weight' or k == 'classifier.6.bias'):
                continue
            state_dict.update({k: v})
        self.load_state_dict(state_dict)

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), 256*6*6)
        x = self.classifier(x)
        return x

model = AlexNet(num_classes=1).to(dev)
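As an aside, since model_pre (the pretrained torchvision model) is already loaded above, roughly the same setup can be obtained by reusing it and replacing only its last linear layer, instead of redefining the whole architecture. A sketch (model_alt is just an illustrative name):

# Sketch: reuse the already-loaded pretrained model and swap only the head
model_alt = models.alexnet(pretrained=True)
model_alt.classifier[6] = nn.Sequential(
    nn.Linear(4096, 1),   # single output for binary classification
    nn.Sigmoid()          # keeps the output compatible with BCELoss
)
model_alt = model_alt.to(dev)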

model.features[0].weight.requires_grad = False
model.features[0].bias.requires_grad = False
model.features[3].weight.requires_grad = False
model.features[3].bias.requires_grad = False
model.features[6].weight.requires_grad = False
model.features[6].bias.requires_grad = False
model.features[8].weight.requires_grad = False
model.features[8].bias.requires_grad = False
model.features[10].weight.requires_grad = False
model.features[10].bias.requires_grad = False
model.classifier[1].weight.requires_grad = False
model.classifier[1].bias.requires_grad = False
model.classifier[4].weight.requires_grad = False
model.classifier[4].bias.requires_grad = False

model.classifier[6].weight.requires_grad = True
model.classifier[6].bias.requires_grad = True
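The same freezing can also be expressed as a single loop over named_parameters(), which is less error-prone if the layer indices ever change; a sketch equivalent to the flags above:

# Freeze everything except the final classifier layer
for name, param in model.named_parameters():
    param.requires_grad = name.startswith('classifier.6')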

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(filter(lambda p:p.requires_grad, model.parameters()), lr=0.0001, betas=(0.9, 0.999))


total_step = len(train_loader)
train_losses, val_losses, train_accs, val_accs = [],[],[],[]


for epoch in range(num_epochs):
    train_loss = 0
    val_loss = 0
    train_acc = 0
    val_acc = 0
    for images,labels in train_loader:
        images = images.to(dev)
        labels = labels.to(dev)

        outputs = model(images)
        loss = criterion(outputs, labels.type(torch.FloatTensor).to(dev))
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


        total = labels.size(0)
        labels = torch.squeeze(labels)

        predicted = np.where(outputs.cpu().detach().numpy()<0.5, 0, 1)
        predicted = torch.from_numpy(predicted)
        predicted = torch.squeeze(predicted)

        correct = (predicted == labels.cpu()).sum().item()
        train_acc += correct / total
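As a side note, the round trip through NumPy is not needed for the accuracy computation; the same 0.5 threshold can be applied directly to the output tensor on the device, for example:

# equivalent thresholding without leaving PyTorch (sketch)
predicted = (outputs.detach() >= 0.5).long().squeeze()
correct = (predicted == labels).sum().item()
train_acc += correct / total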

Time observed before training:- 2019-09-26 10:38:14.336391
Time observed after training:- 2019-09-26 17:02:13.680405

Even though there are only 4,097 learnable parameters, the model is still taking almost 5 to 6 hours to train.
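For what it's worth, that figure (4,096 weights plus 1 bias of the final Linear layer) can be double-checked with a quick count over the model's parameters:

# count only the parameters that will receive gradient updates
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Trainable parameters:', trainable)   # expected: 4097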

Could you try passing only the learnable parameters to the optimizer?
I think the problem is that even if requires_grad is set to False, it only means that the values aren't going to be updated; gradients are still computed, since they have to be backpropagated to previous tensors.

import torch
import time
# two "frozen" tensors and one trainable tensor
x = torch.rand(10, requires_grad=False)
y = torch.rand(10, requires_grad=False)
z = torch.rand(10, requires_grad=True)
def timing(f):
    def wrap(*args):
        time1 = time.time()
        ret = f(*args)
        time2 = time.time()
        print('{:s} function took {:.3f} ms'.format(f.__name__, (time2-time1)*1000.0))

        return ret
    return wrap
@timing
def run(optim,backprop=True):

    optim.zero_grad()
    q=x+y
    p=(z+q).mean()
    print(x)
    print(y)
    print(z)
    print(q)
    if backprop:
        p.backward()
    print(x.grad)
    print(y.grad)
    print(z.grad)
    print(q.grad)
    print(p.grad)
    if backprop:
        optim.step()
    print(x)
    print(y)
    print(z)
    print(q)

# 1) all three tensors handed to the optimizer, full backward + step
run(torch.optim.SGD([x, y, z], lr=1))
# 2) same call under no_grad, with backward/step skipped
with torch.no_grad():
    run(torch.optim.SGD([x, y, z], lr=1), False)
# 3) backward still runs, but only x (which gets no grad) is handed to the optimizer
run(torch.optim.SGD([x], lr=1))
tensor([0.8253, 0.7537, 0.7688, 0.9117, 0.2731, 0.4141, 0.5021, 0.5691, 0.8326,
        0.9245])
tensor([0.2086, 0.2583, 0.3304, 0.6977, 0.2884, 0.6722, 0.2859, 0.4168, 0.3863,
        0.2488])
tensor([0.0729, 0.0993, 0.8631, 0.5255, 0.8684, 0.8381, 0.3063, 0.4755, 0.3240,
        0.5715], requires_grad=True)
tensor([1.0340, 1.0120, 1.0993, 1.6094, 0.5615, 1.0863, 0.7880, 0.9859, 1.2189,
        1.1733])
None
None
tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000])
None
None
tensor([0.8253, 0.7537, 0.7688, 0.9117, 0.2731, 0.4141, 0.5021, 0.5691, 0.8326,
        0.9245])
tensor([0.2086, 0.2583, 0.3304, 0.6977, 0.2884, 0.6722, 0.2859, 0.4168, 0.3863,
        0.2488])
tensor([-2.7121e-02, -7.1023e-04,  7.6311e-01,  4.2552e-01,  7.6838e-01,
         7.3814e-01,  2.0632e-01,  3.7555e-01,  2.2402e-01,  4.7150e-01],
       requires_grad=True)
tensor([1.0340, 1.0120, 1.0993, 1.6094, 0.5615, 1.0863, 0.7880, 0.9859, 1.2189,
        1.1733])
run function took 62.251 ms
tensor([0.8253, 0.7537, 0.7688, 0.9117, 0.2731, 0.4141, 0.5021, 0.5691, 0.8326,
        0.9245])
tensor([0.2086, 0.2583, 0.3304, 0.6977, 0.2884, 0.6722, 0.2859, 0.4168, 0.3863,
        0.2488])
tensor([-2.7121e-02, -7.1023e-04,  7.6311e-01,  4.2552e-01,  7.6838e-01,
         7.3814e-01,  2.0632e-01,  3.7555e-01,  2.2402e-01,  4.7150e-01],
       requires_grad=True)
tensor([1.0340, 1.0120, 1.0993, 1.6094, 0.5615, 1.0863, 0.7880, 0.9859, 1.2189,
        1.1733])
None
None
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
None
None
tensor([0.8253, 0.7537, 0.7688, 0.9117, 0.2731, 0.4141, 0.5021, 0.5691, 0.8326,
        0.9245])
tensor([0.2086, 0.2583, 0.3304, 0.6977, 0.2884, 0.6722, 0.2859, 0.4168, 0.3863,
        0.2488])
tensor([-2.7121e-02, -7.1023e-04,  7.6311e-01,  4.2552e-01,  7.6838e-01,
         7.3814e-01,  2.0632e-01,  3.7555e-01,  2.2402e-01,  4.7150e-01],
       requires_grad=True)
tensor([1.0340, 1.0120, 1.0993, 1.6094, 0.5615, 1.0863, 0.7880, 0.9859, 1.2189,
        1.1733])
run function took 1.616 ms
tensor([0.8253, 0.7537, 0.7688, 0.9117, 0.2731, 0.4141, 0.5021, 0.5691, 0.8326,
        0.9245])
tensor([0.2086, 0.2583, 0.3304, 0.6977, 0.2884, 0.6722, 0.2859, 0.4168, 0.3863,
        0.2488])
tensor([-2.7121e-02, -7.1023e-04,  7.6311e-01,  4.2552e-01,  7.6838e-01,
         7.3814e-01,  2.0632e-01,  3.7555e-01,  2.2402e-01,  4.7150e-01],
       requires_grad=True)
tensor([1.0340, 1.0120, 1.0993, 1.6094, 0.5615, 1.0863, 0.7880, 0.9859, 1.2189,
        1.1733])
None
None
tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000])
None
None
tensor([0.8253, 0.7537, 0.7688, 0.9117, 0.2731, 0.4141, 0.5021, 0.5691, 0.8326,
        0.9245])
tensor([0.2086, 0.2583, 0.3304, 0.6977, 0.2884, 0.6722, 0.2859, 0.4168, 0.3863,
        0.2488])
tensor([-2.7121e-02, -7.1023e-04,  7.6311e-01,  4.2552e-01,  7.6838e-01,
         7.3814e-01,  2.0632e-01,  3.7555e-01,  2.2402e-01,  4.7150e-01],
       requires_grad=True)
tensor([1.0340, 1.0120, 1.0993, 1.6094, 0.5615, 1.0863, 0.7880, 0.9859, 1.2189,
        1.1733])
run function took 1.965 ms

If you look at this (admittedly messy) code: even if the grads are set to False, the fact that you are still running backward and the optimizer over those tensors takes a lot of time, whereas if you go for a no_grad strategy, or simply don't pass them to the optimizer, you save that time.
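Applied to the AlexNet model from the question, one way to follow this advice is to run the frozen part of the network under torch.no_grad() and let only the final layer build a graph. A sketch (forward_frozen_head is just an illustrative helper, assuming the custom AlexNet class above):

# keep autograd out of the frozen layers entirely (sketch)
def forward_frozen_head(model, images):
    with torch.no_grad():                       # no graph for the frozen layers
        feats = model.features(images)
        feats = model.avgpool(feats)
        feats = feats.view(feats.size(0), 256 * 6 * 6)
        feats = model.classifier[:6](feats)     # everything before the trainable Linear
    return model.classifier[6:](feats)          # only this slice tracks gradients

Here classifier[:6] covers the dropout and hidden linear layers, and classifier[6:] is the trainable Linear plus the Sigmoid, so the backward pass only ever touches the final layer.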