Error while converting a pytorch model to onnx

Hello Friends,
I was trying to convert a CapsuleNet based model written in pytorch into onnx.The following is the errro I got.

  (conv1): Conv2d(9, 256, kernel_size=(1, 1), stride=(1, 1))
  (primarycaps): PrimaryCapsule(
    (conv2d): Conv2d(256, 256, kernel_size=(1, 1), stride=(2, 2))
  (digitcaps): DenseCapsule()
  (decoder): Sequential(
    (0): Linear(in_features=160, out_features=512, bias=True)
    (1): ReLU(inplace)
    (2): Linear(in_features=512, out_features=1024, bias=True)
    (3): ReLU(inplace)
    (4): Linear(in_features=1024, out_features=81, bias=True)
    (5): Sigmoid()
  (relu): ReLU()
Traceback (most recent call last):
  File "", line 95, in <module>
  File "", line 92, in test
    torch.onnx.export(model, dummy_input, 'file.onnx')
  File "/home/vijay/anaconda3/lib/python3.7/site-packages/torch/onnx/", line 27, in export
    return utils.export(*args, **kwargs)
  File "/home/vijay/anaconda3/lib/python3.7/site-packages/torch/onnx/", line 111, in export
  File "/home/vijay/anaconda3/lib/python3.7/site-packages/torch/onnx/", line 313, in _export
  File "/home/vijay/anaconda3/lib/python3.7/site-packages/torch/onnx/", line 237, in _model_to_graph
    graph, torch_out = _trace_and_get_graph_from_model(model, args, training)
  File "/home/vijay/anaconda3/lib/python3.7/site-packages/torch/onnx/", line 204, in _trace_and_get_graph_from_model
    trace, torch_out = torch.jit.get_trace_graph(model, args, _force_outplace=True)
  File "/home/vijay/anaconda3/lib/python3.7/site-packages/torch/jit/", line 219, in get_trace_graph
    return LegacyTracedModule(f, _force_outplace, return_inputs)(*args, **kwargs)
  File "/home/vijay/anaconda3/lib/python3.7/site-packages/torch/nn/modules/", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/vijay/anaconda3/lib/python3.7/site-packages/torch/jit/", line 276, in forward
    out = self.inner(*trace_inputs)
  File "/home/vijay/anaconda3/lib/python3.7/site-packages/torch/nn/modules/", line 489, in __call__
    result = self._slow_forward(*input, **kwargs)
  File "/home/vijay/anaconda3/lib/python3.7/site-packages/torch/nn/modules/", line 479, in _slow_forward
    result = self.forward(*input, **kwargs)
  File "", line 65, in forward
    x = self.digitcaps(x)
  File "/home/vijay/anaconda3/lib/python3.7/site-packages/torch/nn/modules/", line 489, in __call__
    result = self._slow_forward(*input, **kwargs)
  File "/home/vijay/anaconda3/lib/python3.7/site-packages/torch/nn/modules/", line 479, in _slow_forward
    result = self.forward(*input, **kwargs)
  File "/home/vijay/Documents/Vijay/Lidar/CapsNet-Pytorch/", line 54, in forward
    x_hat = torch.squeeze(torch.matmul(self.weight, x[:, None, :, :, None]), dim=-1)
RuntimeError: The size of tensor a (1152) must match the size of tensor b (800) at non-singleton dimension 2

Please find the code of the conversion script given below:

Pytorch implementation of CapsNet in paper Dynamic Routing Between Capsules.
The current version maybe only works for TensorFlow backend. Actually it will be straightforward to re-write to TF code.
Adopting to other backends should be easy, but I have not tested this.

       Launch `python -h` for usage help

    Validation accuracy > 99.6% after 50 epochs.
    Speed: About 73s/epoch on a single GTX1070 GPU card and 43s/epoch on a GTX1080Ti GPU.

Author: Xifeng Guo, E-mail: ``, Github: ``

import torch
from torch import nn
from torch.optim import Adam, lr_scheduler
from torch.autograd import Variable
from torchvision import transforms, datasets
from capsulelayers import DenseCapsule, PrimaryCapsule

class CapsuleNet(nn.Module):
    A Capsule Network on MNIST.
    :param input_size: data size = [channels, width, height]
    :param classes: number of classes
    :param routings: number of routing iterations
        - Input: (batch, channels, width, height), optional (batch, classes) .
        - Output:((batch, classes), (batch, channels, width, height))
    def __init__(self, input_size, classes, routings):
        super(CapsuleNet, self).__init__()
        self.input_size = input_size
        self.classes = classes
        self.routings = routings

        # Layer 1: Just a conventional Conv2D layer
        self.conv1 = nn.Conv2d(input_size[0], 256, kernel_size=1, stride=1, padding=0)

        # Layer 2: Conv2D layer with `squash` activation, then reshape to [None, num_caps, dim_caps]
        self.primarycaps = PrimaryCapsule(256, 256, 8, kernel_size=1, stride=2, padding=0)

        # Layer 3: Capsule layer. Routing algorithm works here.
        self.digitcaps = DenseCapsule(in_num_caps=32*6*6, in_dim_caps=8,
                                      out_num_caps=classes, out_dim_caps=16, routings=routings)

        # Decoder network.
        self.decoder = nn.Sequential(
            nn.Linear(16*classes, 512),
            nn.Linear(512, 1024),
            nn.Linear(1024, input_size[0] * input_size[1] * input_size[2]),

        self.relu = nn.ReLU()

    def forward(self, x, y=None):
        x = self.relu(self.conv1(x))
        x = self.primarycaps(x)
        x = self.digitcaps(x)
        length = x.norm(dim=-1)
        if y is None:  # during testing, no label given. create one-hot coding using `length`
            index = length.max(dim=1)[1]
            y = Variable(torch.zeros(length.size()).scatter_(1, index.view(-1, 1).cpu().data, 1.).cuda())
        reconstruction = self.decoder((x * y[:, :, None]).view(x.size(0), -1))
        return length, reconstruction.view(-1, *self.input_size)

def test():
   # model = ClassificationPointNet(10, 0.3, 3).to('cuda')
   # model = model.cuda()
    #dummy_input = torch.randn(64, 2, 3).to('cuda')
    #x = model(dummy_input)
    model = CapsuleNet(input_size=[9, 3, 3], classes=10, routings=3)
    dummy_input = torch.randn(256, 9, 9, 9).to('cuda')
    #torch.onnx.export(model, dummy_input, 'file.onnx'), '/home/vijay/Documents/Vijay/Lidar/CapsNet-Pytorch/data/MNIST/processed/')
    #device = torch.device("cuda")
    state_dict = torch.load('/home/vijay/Documents/Vijay/Lidar/CapsNet-Pytorch/data/MNIST/processed/')
   # print(tensor.device)

    dummy_input = torch.randn(256, 9, 9,9).to('cuda')
    torch.onnx.export(model, dummy_input, 'file.onnx')

if __name__ == '__main__':

If you model working fine in plain PyTorch without the ONNX export?
Unfortunately, I cannot run the code, as from capsulelayers import DenseCapsule, PrimaryCapsule is undefined.

Hello ptrblck,
Please find the code for capsulelayers given below:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

def squash(inputs, axis=-1):
    The non-linear activation used in Capsule. It drives the length of a large vector to near 1 and small vector to 0
    :param inputs: vectors to be squashed
    :param axis: the axis to squash
    :return: a Tensor with same size as inputs
    norm = torch.norm(inputs, p=2, dim=axis, keepdim=True)
    scale = norm**2 / (1 + norm**2) / (norm + 1e-8)
    return scale * inputs

class DenseCapsule(nn.Module):
    The dense capsule layer. It is similar to Dense (FC) layer. Dense layer has `in_num` inputs, each is a scalar, the
    output of the neuron from the former layer, and it has `out_num` output neurons. DenseCapsule just expands the
    output of the neuron from scalar to vector. So its input size = [None, in_num_caps, in_dim_caps] and output size = \
    [None, out_num_caps, out_dim_caps]. For Dense Layer, in_dim_caps = out_dim_caps = 1.

    :param in_num_caps: number of cpasules inputted to this layer
    :param in_dim_caps: dimension of input capsules
    :param out_num_caps: number of capsules outputted from this layer
    :param out_dim_caps: dimension of output capsules
    :param routings: number of iterations for the routing algorithm
    def __init__(self, in_num_caps, in_dim_caps, out_num_caps, out_dim_caps, routings=3):
        super(DenseCapsule, self).__init__()
        self.in_num_caps = in_num_caps
        self.in_dim_caps = in_dim_caps
        self.out_num_caps = out_num_caps
        self.out_dim_caps = out_dim_caps
        self.routings = routings
        self.weight = nn.Parameter(0.01 * torch.randn(out_num_caps, in_num_caps, out_dim_caps, in_dim_caps))

    def forward(self, x):
        # x.size=[batch, in_num_caps, in_dim_caps]
        # expanded to    [batch, 1,            in_num_caps, in_dim_caps,  1]
        # weight.size   =[       out_num_caps, in_num_caps, out_dim_caps, in_dim_caps]
        # torch.matmul: [out_dim_caps, in_dim_caps] x [in_dim_caps, 1] -> [out_dim_caps, 1]
        # => x_hat.size =[batch, out_num_caps, in_num_caps, out_dim_caps]
        x_hat = torch.squeeze(torch.matmul(self.weight, x[:, None, :, :, None]), dim=-1)

        # In forward pass, `x_hat_detached` = `x_hat`;
        # In backward, no gradient can flow from `x_hat_detached` back to `x_hat`.
        x_hat_detached = x_hat.detach()

        # The prior for coupling coefficient, initialized as zeros.
        # b.size = [batch, out_num_caps, in_num_caps]
        b = Variable(torch.zeros(x.size(0), self.out_num_caps, self.in_num_caps)).cuda()

        assert self.routings > 0, 'The \'routings\' should be > 0.'
        for i in range(self.routings):
            # c.size = [batch, out_num_caps, in_num_caps]
            c = F.softmax(b, dim=1)

            # At last iteration, use `x_hat` to compute `outputs` in order to backpropagate gradient
            if i == self.routings - 1:
                # c.size expanded to [batch, out_num_caps, in_num_caps, 1           ]
                # x_hat.size     =   [batch, out_num_caps, in_num_caps, out_dim_caps]
                # => outputs.size=   [batch, out_num_caps, 1,           out_dim_caps]
                outputs = squash(torch.sum(c[:, :, :, None] * x_hat, dim=-2, keepdim=True))
                # outputs = squash(torch.matmul(c[:, :, None, :], x_hat))  # alternative way
            else:  # Otherwise, use `x_hat_detached` to update `b`. No gradients flow on this path.
                outputs = squash(torch.sum(c[:, :, :, None] * x_hat_detached, dim=-2, keepdim=True))
                # outputs = squash(torch.matmul(c[:, :, None, :], x_hat_detached))  # alternative way

                # outputs.size       =[batch, out_num_caps, 1,           out_dim_caps]
                # x_hat_detached.size=[batch, out_num_caps, in_num_caps, out_dim_caps]
                # => b.size          =[batch, out_num_caps, in_num_caps]
                b = b + torch.sum(outputs * x_hat_detached, dim=-1)

        return torch.squeeze(outputs, dim=-2)

class PrimaryCapsule(nn.Module):
    Apply Conv2D with `out_channels` and then reshape to get capsules
    :param in_channels: input channels
    :param out_channels: output channels
    :param dim_caps: dimension of capsule
    :param kernel_size: kernel size
    :return: output tensor, size=[batch, num_caps, dim_caps]
    def __init__(self, in_channels, out_channels, dim_caps, kernel_size, stride=1, padding=0):
        super(PrimaryCapsule, self).__init__()
        self.dim_caps = dim_caps
        self.conv2d = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)

    def forward(self, x):
        outputs = self.conv2d(x)
        outputs = outputs.view(x.size(0), -1, self.dim_caps)
        return squash(outputs)

Best Regards,

Thanks for the code.
Your model isn’t working with the specified shape in PyTorch-only, so this issue doesn’t seem to be ONNX specific.
DenseCapsule seems to raise the size mismatch in:

x_hat = torch.squeeze(torch.matmul(self.weight, x[:, None, :, :, None]), dim=-1)


RuntimeError: The size of tensor a (1152) must match the size of tensor b (800) at non-singleton dimension 2