Executorch Model Output not same as the Base Model

Hello,

I am using ExecuTorch for deployment and converting my model from the base model to ExecuTorch. Now, when I try to verify the output from my ExecuTorch model (using the Python-based ExecuTorch runtime APIs), it fails to generate the correct output and produces output that is completely wrong. (I am training for an Image → Image task.)

To verify the behavior and check intermediate values, I truncated the original model I am using down to a very basic model to replicate the issue, and I see that the intermediate output produced by the original model and the ExecuTorch model is entirely different. Could someone please help me verify whether I am doing the right thing here?

**Original Model (and its Evaluation):**

import torch
import torch.nn as nn
import torch.nn.functional as F


class ConvBlock(nn.Module):
    def __init__(self, in_channel, out_channel, strides=1):
        super(ConvBlock, self).__init__()
        self.strides = strides
        self.in_channel=in_channel
        self.out_channel=out_channel
        self.block = nn.Sequential(
            nn.Conv2d(in_channel, out_channel, kernel_size=3, stride=strides, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.Conv2d(out_channel, out_channel, kernel_size=3, stride=strides, padding=1),
            nn.LeakyReLU(inplace=True),
        )
        self.conv11 = nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=strides, padding=0)

    def forward(self, x):
        out1 = self.block(x)
        out2 = self.conv11(x)
        out = out1 + out2
        return out


class UNet(nn.Module):
    """Truncated U-Net encoder: five residual conv stages with strided-conv
    downsampling between them (no decoder/upsampling — deliberately cut to
    inspect intermediate activations).

    Args:
        block: residual block class used for every stage (default ConvBlock).
        dim: channel width of the first stage; doubled at each later stage.
    """

    def __init__(self, block=ConvBlock, dim=8):
        super(UNet, self).__init__()

        self.dim = dim
        # Use `block` for stage 1 as well — previously this hard-coded
        # ConvBlock and silently ignored the `block` argument, unlike
        # stages 2-5.  Behavior is unchanged for the default block=ConvBlock.
        self.ConvBlock1 = block(3, dim, strides=1)
        self.pool1 = nn.Conv2d(dim, dim, kernel_size=4, stride=2, padding=1)

        self.ConvBlock2 = block(dim, dim * 2, strides=1)
        self.pool2 = nn.Conv2d(dim * 2, dim * 2, kernel_size=4, stride=2, padding=1)

        self.ConvBlock3 = block(dim * 2, dim * 4, strides=1)
        self.pool3 = nn.Conv2d(dim * 4, dim * 4, kernel_size=4, stride=2, padding=1)

        self.ConvBlock4 = block(dim * 4, dim * 8, strides=1)
        self.pool4 = nn.Conv2d(dim * 8, dim * 8, kernel_size=4, stride=2, padding=1)

        self.ConvBlock5 = block(dim * 8, dim * 16, strides=1)

    def forward(self, x):
        """Run the five encoder stages; each pool halves H and W, so the
        final feature map is (dim*16, H/16, W/16)."""
        conv1 = self.ConvBlock1(x)
        pool1 = self.pool1(conv1)

        conv2 = self.ConvBlock2(pool1)
        pool2 = self.pool2(conv2)

        conv3 = self.ConvBlock3(pool2)
        pool3 = self.pool3(conv3)

        conv4 = self.ConvBlock4(pool3)
        pool4 = self.pool4(conv4)

        conv5 = self.ConvBlock5(pool4)
        return conv5

def load_torch(weights):
    """Build a UNet and load a checkpoint stored under key 'state_dict'.

    Strips the 'module.' prefix that nn.DataParallel adds to parameter
    names, loads the weights non-strictly, and returns the model in eval
    mode on CPU.

    Args:
        weights: path to the .pth checkpoint file.
    Returns:
        The UNet model, on CPU, in eval mode.
    """
    model = UNet()
    model.to('cpu')

    # map_location keeps CUDA-saved checkpoints loadable on CPU-only hosts.
    state_dict = torch.load(weights, map_location='cpu')['state_dict']

    # Only strip a genuine leading 'module.' prefix.  The previous
    # `'module' in k` substring test would also truncate the first seven
    # characters of any key that merely *contains* "module".
    new_state_dict = OrderedDict(
        (k[len('module.'):] if k.startswith('module.') else k, v)
        for k, v in state_dict.items()
    )

    # NOTE(review): strict=False silently ignores missing/unexpected keys —
    # a partially-loaded model is a classic cause of wrong outputs; inspect
    # the return value of load_state_dict when debugging.
    model.load_state_dict(new_state_dict, strict=False)
    model.eval()
    return model


if __name__ == "__main__":
    import numpy as np
    import cv2
    import os

    from collections import OrderedDict

    torch_path = os.path.join('../../models', 'denoising_unet_sidd_1_39_06.pth')
    model = load_torch(torch_path)

    image_dir = os.path.join('../../dataset/SIDD/val/LR/')

    for img in os.listdir(image_dir):
        # cv2 gives BGR HxWxC uint8; convert to NCHW float32 in [0, 1].
        input_data = cv2.imread(os.path.join(image_dir, img))[np.newaxis, ...]
        input_data = np.transpose(input_data, axes=(0, 3, 1, 2)).astype(np.float32)
        input_data = torch.from_numpy(input_data / 255.)

        # Inference only: no autograd graph needed.
        with torch.no_grad():
            output_data = model(input_data)

        output_data = np.squeeze(output_data.cpu().numpy())
        # splitext is safer than split('.')[0] for filenames containing dots.
        np.save(os.path.splitext(img)[0] + '.npy', output_data)


Output is of shape: [128, 16, 16]
I am printing : output [0,0,:]
array([-0.02719986, -0.07920527, -0.08054526, -0.09254298, -0.13592963,
-0.06691809, 0.02102921, -0.07319918, -0.08861651, -0.0484544 ,
-0.11791661, -0.02734638, -0.06298678, -0.07550491, -0.08531246,
-0.03409318], dtype=float32)

Executorch Model Conversion Code:

import os
import torch

from runpy import run_path
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower
from torch.export import Dim, export
from collections import OrderedDict


def load_torch(weights):
    """Load the UNet architecture from ./unet.py via run_path, then load a
    checkpoint stored under key 'state_dict'.

    Mirrors the loader in unet.py: strips the 'module.' prefix added by
    nn.DataParallel, loads non-strictly, and returns the model in eval
    mode on CPU.

    Args:
        weights: path to the .pth checkpoint file.
    Returns:
        The UNet model, on CPU, in eval mode.
    """
    load_arch = run_path('./unet.py')
    conv_block = load_arch['ConvBlock']
    model = load_arch['UNet'](block=conv_block, dim=8)

    model.to('cpu')

    # map_location keeps CUDA-saved checkpoints loadable on CPU-only hosts.
    state_dict = torch.load(weights, map_location='cpu')['state_dict']

    # Only strip a genuine leading 'module.' prefix (the old substring test
    # mangled any key merely containing "module").
    new_state_dict = OrderedDict(
        (k[len('module.'):] if k.startswith('module.') else k, v)
        for k, v in state_dict.items()
    )

    # NOTE(review): strict=False hides missing/unexpected keys.  Since the
    # model is built from a run_path copy of the architecture, any key
    # mismatch is silently skipped — print load_state_dict's result here
    # when debugging output differences between exported and original model.
    model.load_state_dict(new_state_dict, strict=False)
    model.eval()
    return model



def torch2execu(torch_path, execu_path):
    """Export the UNet checkpoint to an ExecuTorch .pte file lowered to XNNPACK.

    Args:
        torch_path: path to the PyTorch .pth checkpoint.
        execu_path: output path for the serialized .pte program.
    """
    # load_torch already returns the model in eval mode; the extra
    # model.eval() call that used to follow was redundant.
    model = load_torch(torch_path)
    example_inputs = (torch.randn(2, 3, 256, 256),)

    # NOTE(review): the model downsamples 4x via stride-2 convs, so h and w
    # should be multiples of 16; these Dim bounds do not enforce that —
    # confirm the runtime inputs respect it.
    dynamic_shapes = {
        "x": {
            0: Dim("b", min=1, max=4),
            2: Dim("h", min=256, max=512),
            3: Dim("w", min=256, max=512),
        }
    }

    exported_program = export(model, example_inputs, dynamic_shapes=dynamic_shapes)
    executorch_program = to_edge_transform_and_lower(
        exported_program,
        partitioner=[XnnpackPartitioner()],
    ).to_executorch()

    with open(execu_path, "wb") as file:
        file.write(executorch_program.buffer)

# Entry point: guard so importing this module does not trigger the export.
if __name__ == "__main__":
    torch_path = os.path.join('../../models', 'denoising_unet_sidd_1_39_06.pth')
    execu_path = 'unet_1_256x256.pte'
    torch2execu(torch_path, execu_path)

Successfully converts to .pte file.

Executing the Executorch File:

import torch
import os
import cv2
import numpy as np

from executorch.runtime import Runtime, Program, Verification
# Fixed: the pasted snippet used typographic quotes (U+2018/U+201C) and the
# ellipsis character U+2026 instead of ASCII quotes and '...', which is not
# valid Python syntax.
model_path = os.path.join('unet_1_256x256.pte')
image_dir = os.path.join('../../dataset/SIDD/val/LR/')


runtime = Runtime.get()
program = runtime.load_program(model_path)
method = program.load_method("forward")

for img in os.listdir(image_dir):
    # Same preprocessing as the PyTorch evaluation script:
    # BGR HxWxC uint8 -> NCHW float32 in [0, 1].
    input_data = cv2.imread(os.path.join(image_dir, img))[np.newaxis, ...]
    input_data = np.transpose(input_data, axes=(0, 3, 1, 2)).astype(np.float32)
    input_data = torch.from_numpy(input_data / 255.)
    output_data = method.execute([input_data])[0]

    output_data = np.squeeze(output_data.numpy())
    # splitext is safer than split('.')[0] for filenames containing dots.
    np.save(os.path.splitext(img)[0] + '.npy', output_data)

Output is of shape: [128, 16, 16]
I am printing : output [0,0,:]

array([ 0.14626774, 0.05134978, -0.00781145, 0.01976327, 0.09649369,
0.00808653, -0.00127539, 0.00698734, -0.03545413, 0.0016862 ,
-0.00149567, -0.03094362, 0.03917985, -0.04818957, 0.08538197,
0.05506491], dtype=float32)

Which is entirely different from the output of the original model.

PS: You will notice that although I said earlier that I am training/evaluating for an Image → Image task, there are no upsampler layers here — that is because I have provided a truncated model, to verify the output after a certain layer.

Also, I converted my PyTorch model to TFLite and checked the output of the TFLite model (in the TFLite runtime), and its output matches the original model. So something has gone wrong with my ExecuTorch model itself; any insight would be helpful.

Thank you!