Completely different results using TensorFlow and PyTorch for MobileNetV3 Small

I am using transfer learning from MobileNetV3 Small to predict 5 different points on an image. I am doing this as a regression task.

For both models:

  • Setting the last 50 layers trainable and adding the same fully connected layers to the end.
  • Learning rate 3e-2
  • Batch size 32
  • Adam optimizer with the same betas
  • 100 epochs
  • The inputs are unscaled RGB images (see the sketch just below)
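
To make "unscaled" concrete, this is roughly how the inputs look (a minimal sketch; the actual data loading code is not shown here):

import numpy as np
import torch

# one 224x224 RGB image as float32 in the raw 0-255 range
# (no /255 rescaling, no ImageNet mean/std normalization)
img = np.random.randint(0, 256, size=(224, 224, 3)).astype(np.float32)

torch_input = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0)  # NCHW for PyTorch
keras_input = img[None, ...]                                       # NHWC for Keras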

PyTorch

Model

# imports used across the snippets below
import numpy as np
import torch
import torchvision
from torch import nn, optim

def _init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)
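
# get_children() flattens the model into a list of its leaf modules so that
# the last 50 of them can stay trainable; roughly something like this:
def get_children(module: nn.Module):
    children = list(module.children())
    if not children:           # leaf module, e.g. Conv2d / BatchNorm / Linear
        return [module]
    flat = []
    for child in children:
        flat.extend(get_children(child))
    return flat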

def get_mob_v3_small():
    model = torchvision.models.mobilenet_v3_small(pretrained=True)
    children_list = get_children(model)
    # freeze everything except the last 50 layers
    for c in children_list[:-50]:
        for p in c.parameters():
            p.requires_grad = False

    return model

class TransferMobileNetV3_v2(nn.Module):
    def __init__(self,
            num_keypoints: int = 5):
        super(TransferMobileNetV3_v2, self).__init__()

        self.classifier_neurons = num_keypoints*2
        self.base_model = get_mob_v3_small()

        # torchvision's mobilenet_v3_small flattens to 576 features before the
        # classifier, so the replacement head has to start from 576
        self.base_model.classifier = nn.Sequential(
                                            nn.Linear(in_features=576, out_features=1024),
                                            nn.ReLU(),
                                            nn.Linear(in_features=1024, out_features=512),
                                            nn.ReLU(),
                                            nn.Linear(in_features=512, out_features=self.classifier_neurons)
                                        )

        self.base_model.apply(_init_weights)

    def forward(self, x):
        out = self.base_model(x)
        return out
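
A quick sanity check of the head is a dummy forward pass (sketch; the 224x224 input size matches the Keras side):

net = TransferMobileNetV3_v2(num_keypoints=5)
net.eval()
with torch.no_grad():
    dummy = torch.zeros(1, 3, 224, 224)   # NCHW dummy batch
    print(net(dummy).shape)               # expected: torch.Size([1, 10])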

Training Script

def train(net, trainloader, testloader, train_loss_fn, optimizer, scaler, args):

    len_dataloader = len(trainloader)

    for epoch in range(1, args.epochs+1):
        net.train()

        for batch_idx, sample in enumerate(trainloader):

            inputs, labels = sample
            inputs, labels = inputs.to(args.device), labels.to(args.device)

            optimizer.zero_grad()

            with torch.cuda.amp.autocast(args.use_amp):
                prediction = net(inputs)
                loss = train_loss_fn(prediction, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

def main():
    args = make_args_parser()
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    seed = args.seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)

    net = TransferMobileNetV3_v2(num_keypoints=5).to(args.device)

    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=3e-2,
                           betas=(0.9, 0.999))
    scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)

    # train_loader and test_loader come from the dataset code, which is not shown here
    train(net, train_loader, test_loader, loss_fn, optimizer, scaler, args)
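
train() receives testloader but never uses it; a rough counterpart to Keras's validation_split would be a separate evaluation pass, something like this (sketch, not part of the script above):

@torch.no_grad()
def evaluate(net, testloader, loss_fn, device):
    net.eval()
    total, n = 0.0, 0
    for inputs, labels in testloader:
        inputs, labels = inputs.to(device), labels.to(device)
        total += loss_fn(net(inputs), labels).item() * inputs.size(0)
        n += inputs.size(0)
    return total / n   # mean MSE over the held-out set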

TensorFlow

Model

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model

base_model = tf.keras.applications.MobileNetV3Small(weights='imagenet', input_shape=(224, 224, 3))

x_in = base_model.layers[-6].output

x = Dense(units=1024, activation="relu")(x_in)
x = Dense(units=512, activation="relu")(x)
x = Dense(units=10, activation="linear")(x)

model = Model(inputs=base_model.input, outputs=x)

for layer in model.layers[:-50]:
    layer.trainable=False
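
Since layers[-6] is just an index, a quick way to double-check where the new head attaches is to list the tail of the base model (a small sanity-check sketch):

for layer in base_model.layers[-8:]:
    print(layer.name, layer.output_shape, layer.trainable)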

Training Script

model.compile(loss="mse",
              optimizer=tf.keras.optimizers.Adam(learning_rate=3e-2))

history = model.fit(input_numpy, output_numpy,
                    verbose=1,
                    batch_size=32, epochs=100, validation_split=0.2)

Results

  • The PyTorch model predicts a single point near the center of the image for all 5 keypoints.
  • The TensorFlow model predicts the points quite well and is quite accurate.
  • The training loss of the PyTorch model is much higher than that of the TensorFlow model.

Please let me know what is going wrong, as I am trying my best to shift to PyTorch for this work and need the PyTorch model to give similar/identical results.

Note: I also noticed that the MobileNetV3 Small architecture seems to differ between PyTorch and TensorFlow. I do not know if I am interpreting it wrong, but I am putting it here just in case.
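
One quick way to compare the two backbones is to count their parameters (sketch; same weights arguments as above):

import torchvision
import tensorflow as tf

torch_backbone = torchvision.models.mobilenet_v3_small(pretrained=True)
torch_params = sum(p.numel() for p in torch_backbone.parameters())

keras_backbone = tf.keras.applications.MobileNetV3Small(weights='imagenet', input_shape=(224, 224, 3))
keras_params = keras_backbone.count_params()

print(torch_params, keras_params)   # compare the total parameter counts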