How to use a network with Batchnorm in eval mode as a feature extractor for a different network

Hi,

What you said about `requires_grad = False` is true and, as @Isaac_Kargar mentioned, you should not put the model in eval mode. Putting a model in eval mode does not affect the autograd engine; it only switches layers like dropout or batchnorm to their eval behavior, which is not desired in the case of extracting features. (see this post)
Here is a snippet that uses VGG16 with batch norm as the feature extractor and a simple one-layer linear model as model1 in your case. It works just fine, although the model is complete nonsense; I used simple code only to demonstrate the idea.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import vgg16_bn
from torch import optim
# a model that gets another model as feature_extractor
class Model(nn.Module):
    """Linear head whose output is summed with features from another network.

    Args:
        feature_extractor: Module called on the raw input (model2 in your
            case). Its output is added in place to the linear layer's output,
            so it must broadcast to ``(batch, out_features)``.
        in_features: Number of elements per sample once the input is
            flattened. Defaults to ``3 * 256 * 256``.
        out_features: Output width of the linear layer. Defaults to ``1000``.

    Note:
        ``feature_extractor`` is registered as a submodule, so calling
        ``.train()`` / ``.eval()`` on this model also switches the extractor,
        and ``.parameters()`` also yields the extractor's parameters.
    """

    def __init__(
        self,
        feature_extractor,
        in_features=3 * 256 * 256,
        out_features=1000,
    ):
        super().__init__()
        self.feature_extractor = feature_extractor  # model2 in your case
        self.layer = nn.Linear(in_features, out_features)  # model1 in your case

    def forward(self, x):
        """Return ``layer(flatten(x)) + feature_extractor(x)``."""
        features = self.feature_extractor(x)
        # Flatten everything but the batch dimension; the batch size is taken
        # from the input instead of being hard-coded to 5.
        flat = x.flatten(start_dim=1)
        out = self.layer(flat)
        out += features  # using features from model2
        return out


# Frozen feature extractor: gradients are disabled, and eval mode makes
# BatchNorm use its running statistics and turns Dropout off.
model2 = vgg16_bn(pretrained=True).eval()
for param in model2.parameters():
    param.requires_grad = False
model1 = Model(feature_extractor=model2)
model1.train()
# model2 is a registered submodule of model1, so model1.train() has just
# switched it back to training mode; put the extractor in eval mode again.
model2.eval()

criterion = nn.L1Loss()
# model1.parameters() also yields model2's (frozen) parameters because model2
# is a submodule, so hand the optimizer only the parameters that are trainable.
trainable_params = [p for p in model1.parameters() if p.requires_grad]
optimizer = optim.SGD(trainable_params, lr=0.001, momentum=0.9)

running_loss = 0.0
for _ in range(3):
    x = torch.randn(5, 3, 256, 256)  # consider this as inputs that every batch changes
    optimizer.zero_grad()

    with torch.set_grad_enabled(True):
        outputs = model1(x)
        loss = criterion(outputs, torch.ones(outputs.shape))  # a weird loss!

        loss.backward()
        optimizer.step()

    # loss is a batch mean; scale by batch size to accumulate a per-sample sum
    running_loss += loss.item() * x.size(0)
    print(running_loss)

Here is an elaborate post about using pretrained models as feature extractors.

bests