Completely different results using TensorFlow and PyTorch for MobileNetV3 Small

I am using transfer learning from MobileNetV3 Small to predict 5 different points on an image. I am doing this as a regression task.

For both models:

  • Setting the last 50 layers trainable and adding the same fully connected layers to the end.
  • Learning rate 3e-2
  • Batch size 32
  • Adam optimizer with the same betas
  • 100 epochs
  • The inputs are unscaled RGB images (see the sketch just below)
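
To make "unscaled" concrete, this is roughly how the inputs look (a minimal sketch; the actual data loading code is not shown here):

import numpy as np
import torch

# one 224x224 RGB image as float32 in the raw 0-255 range
# (no /255 rescaling, no ImageNet mean/std normalization)
img = np.random.randint(0, 256, size=(224, 224, 3)).astype(np.float32)

torch_input = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0)  # NCHW for PyTorch
keras_input = img[None, ...]                                       # NHWC for Keras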

PyTorch

Model

# imports used across the snippets below
import numpy as np
import torch
import torchvision
from torch import nn, optim

def _init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)
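
# get_children() flattens the model into a list of its leaf modules so that
# the last 50 of them can stay trainable; roughly something like this:
def get_children(module: nn.Module):
    children = list(module.children())
    if not children:           # leaf module, e.g. Conv2d / BatchNorm / Linear
        return [module]
    flat = []
    for child in children:
        flat.extend(get_children(child))
    return flat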

def get_mob_v3_small():
    model = torchvision.models.mobilenet_v3_small(pretrained=True)
    children_list = get_children(model)
    # freeze everything except the last 50 layers
    for c in children_list[:-50]:
        for p in c.parameters():
            p.requires_grad = False

    return model

class TransferMobileNetV3_v2(nn.Module):
    def __init__(self,
            num_keypoints: int = 5):
        super(TransferMobileNetV3_v2, self).__init__()

        self.classifier_neurons = num_keypoints*2
        self.base_model = get_mob_v3_small()

        # torchvision's mobilenet_v3_small flattens to 576 features before the
        # classifier, so the replacement head has to start from 576
        self.base_model.classifier = nn.Sequential(
                                            nn.Linear(in_features=576, out_features=1024),
                                            nn.ReLU(),
                                            nn.Linear(in_features=1024, out_features=512),
                                            nn.ReLU(),
                                            nn.Linear(in_features=512, out_features=self.classifier_neurons)
                                        )

        self.base_model.apply(_init_weights)

    def forward(self, x):
        out = self.base_model(x)
        return out
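
A quick sanity check of the head is a dummy forward pass (sketch; the 224x224 input size matches the Keras side):

net = TransferMobileNetV3_v2(num_keypoints=5)
net.eval()
with torch.no_grad():
    dummy = torch.zeros(1, 3, 224, 224)   # NCHW dummy batch
    print(net(dummy).shape)               # expected: torch.Size([1, 10])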

Training Script

def train(net, trainloader, testloader, train_loss_fn, optimizer, scaler, args):

    len_dataloader = len(trainloader)

    for epoch in range(1, args.epochs+1):
        net.train()

        for batch_idx, sample in enumerate(trainloader):

            inputs, labels = sample
            inputs, labels = inputs.to(args.device), labels.to(args.device)

            optimizer.zero_grad()

            with torch.cuda.amp.autocast(args.use_amp):
                prediction = net(inputs)
                loss = train_loss_fn(prediction, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

def main():
    args = make_args_parser()
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    seed = args.seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)

    net = TransferMobileNetV3_v2(num_keypoints=5).to(args.device)

    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=3e-2,
                           betas=(0.9, 0.999))
    scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)

    # train_loader and test_loader come from the dataset code, which is not shown here
    train(net, train_loader, test_loader, loss_fn, optimizer, scaler, args)
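
train() receives testloader but never uses it; a rough counterpart to Keras's validation_split would be a separate evaluation pass, something like this (sketch, not part of the script above):

@torch.no_grad()
def evaluate(net, testloader, loss_fn, device):
    net.eval()
    total, n = 0.0, 0
    for inputs, labels in testloader:
        inputs, labels = inputs.to(device), labels.to(device)
        total += loss_fn(net(inputs), labels).item() * inputs.size(0)
        n += inputs.size(0)
    return total / n   # mean MSE over the held-out set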

TensorFlow

Model

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model

base_model = tf.keras.applications.MobileNetV3Small(weights='imagenet', input_shape=(224, 224, 3))

x_in = base_model.layers[-6].output

x = Dense(units=1024, activation="relu")(x_in)
x = Dense(units=512, activation="relu")(x)
x = Dense(units=10, activation="linear")(x)

model = Model(inputs=base_model.input, outputs=x)

for layer in model.layers[:-50]:
    layer.trainable=False
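
Since layers[-6] is just an index, a quick way to double-check where the new head attaches is to list the tail of the base model (a small sanity-check sketch):

for layer in base_model.layers[-8:]:
    print(layer.name, layer.output_shape, layer.trainable)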

Training Script

model.compile(loss="mse",
              optimizer=tf.keras.optimizers.Adam(learning_rate=3e-2))

history = model.fit(input_numpy, output_numpy,
                    verbose=1,
                    batch_size=32, epochs=100, validation_split=0.2)

Results

  • The PyTorch model predicts a single point near the center of the image for all 5 keypoints.
  • The TensorFlow model predicts the points quite well and is quite accurate.
  • The training loss of the PyTorch model is much higher than that of the TensorFlow model.

Please let me know what is going wrong, as I am trying my best to shift to PyTorch for this work and need the PyTorch model to give similar/identical results.

Note: I also noticed that the MobileNetV3 Small architecture seems to differ between PyTorch and TensorFlow. I do not know if I am interpreting it wrong, but I am putting it here just in case.
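
One quick way to compare the two backbones is to count their parameters (sketch; same weights arguments as above):

import torchvision
import tensorflow as tf

torch_backbone = torchvision.models.mobilenet_v3_small(pretrained=True)
torch_params = sum(p.numel() for p in torch_backbone.parameters())

keras_backbone = tf.keras.applications.MobileNetV3Small(weights='imagenet', input_shape=(224, 224, 3))
keras_params = keras_backbone.count_params()

print(torch_params, keras_params)   # compare the total parameter counts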