Key error converting PyTorch model to ONNX

Hi,
I am trying to convert a ConvLSTM PyTorch network to ONNX, and during the conversion I hit a `RuntimeError: Error(s) in loading state_dict for ConvLSTM: Missing key(s) in state_dict:` error. I have searched online for a solution, to no avail. Below is the code I use to convert the model:

import torch
from torch.autograd import Variable
from collections import OrderedDict

state_dict = torch.load('model.pth', map_location='cpu')

new_state_dict = OrderedDict()
for k, v in state_dict.items():
    name = k[7:]  # remove 'module.'
new_state_dict[name] = v

model = ConvLSTM(input_size=(4, 8), input_dim=3, hidden_dim=[64, 64, 128],
                 kernel_size=(3, 3), num_layers=3, batch_first=True)
model.load_state_dict(new_state_dict)

# define batch_size, channels, height, width for the input placeholder
b, c, h, w = 1, 3, 4, 8
T = 6
x = Variable(torch.rand(T, b, c, h, w))

torch.onnx.export(model, x, 'model.onnx')

How do I resolve the error?

FYI, the full error message is

RuntimeError: Error(s) in loading state_dict for ConvLSTM:
	Missing key(s) in state_dict: "cell_list.0.conv.weight", "cell_list.0.conv.bias", "cell_list.1.conv.weight", "cell_list.1.conv.bias", "cell_list.2.conv.weight", "cell_list.2.conv.bias". 
	Unexpected key(s) in state_dict: "nv.bias".

Hi,

I think the problem happens before you try to convert it to ONNX, no? It seems to happen when you call load_state_dict().
And the problem appears to be that the structure of the saved weights is not the same as the structure of the ConvLSTM that you now have.
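
You can compare the two sets of keys directly to see the mismatch. A quick sketch, assuming the `state_dict` and `model` from your snippet:

saved_keys = set(state_dict.keys())
model_keys = set(model.state_dict().keys())
print('in model but not in file:', sorted(model_keys - saved_keys))
print('in file but not in model:', sorted(saved_keys - model_keys))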

Yes, the problem appears during `load_state_dict()`. I have looked at my model instantiation and the model constructor below from the model class, and I cannot seem to spot the issue.

class ConvLSTM(nn.Module):

    def __init__(self, input_size, input_dim, hidden_dim, kernel_size, num_layers,
                 batch_first=False, bias=True, return_all_layers=False):
        super(ConvLSTM, self).__init__()

        self._check_kernel_size_consistency(kernel_size)

        # Make sure that both `kernel_size` and `hidden_dim` are lists having len == num_layers
        kernel_size = self._extend_for_multilayer(kernel_size, num_layers)
        hidden_dim  = self._extend_for_multilayer(hidden_dim, num_layers)
        if not len(kernel_size) == len(hidden_dim) == num_layers:
            raise ValueError('Inconsistent list length.')

        self.height, self.width = input_size

        self.input_dim  = input_dim
        self.hidden_dim = hidden_dim
        self.kernel_size = kernel_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.bias = bias
        self.return_all_layers = return_all_layers

        cell_list = []
        for i in range(0, self.num_layers):
            cur_input_dim = self.input_dim if i == 0 else self.hidden_dim[i-1]

            cell_list.append(ConvLSTMCell(input_size=(self.height, self.width),
                                          input_dim=cur_input_dim,
                                          hidden_dim=self.hidden_dim[i],
                                          kernel_size=self.kernel_size[i],
                                          bias=self.bias))

        self.cell_list = nn.ModuleList(cell_list)

From the error message, your new model has a ModuleList called cell_list that contains three cells, each with a submodule called conv holding a weight and a bias that are not present in your saved state_dict.
And the saved state_dict has a field nv with a Parameter called bias that is not present in your current model.
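
One likely culprit is the loading loop itself: `k[7:]` unconditionally strips seven characters, which mangles any key that does not start with `module.` (for instance, a hypothetical key `'cell.conv.bias'` sliced with `[7:]` gives exactly `'nv.bias'`), and since `new_state_dict[name] = v` sits outside the for loop, only the last key ends up in the new dict. A minimal sketch of a safer version:

from collections import OrderedDict

new_state_dict = OrderedDict()
for k, v in state_dict.items():
    # only strip the prefix when it is actually there
    name = k[len('module.'):] if k.startswith('module.') else k
    new_state_dict[name] = v  # inside the loop, so every key is kept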

Thanks albanD. Running the loader with strict=False removes the error, but the generated model file is definitely wrong: the original .pth was 600MB and the generated ONNX is 12MB. If it's not too much trouble, with the model class definition below, how does one initialize a ConvLSTM model, say with dummy parameters? I just want to see what I'm missing.

class ConvLSTMCell(nn.Module):

    def __init__(self, input_size, input_dim, hidden_dim, kernel_size, bias):
        """
        Initialize ConvLSTM cell.

        Parameters
        ----------
        input_size: (int, int)
            Height and width of input tensor as (height, width).
        input_dim: int
            Number of channels of input tensor.
        hidden_dim: int
            Number of channels of hidden state.
        kernel_size: (int, int)
            Size of the convolutional kernel.
        bias: bool
            Whether or not to add the bias.
        """

        super(ConvLSTMCell, self).__init__()

        self.height, self.width = input_size
        self.input_dim  = input_dim
        self.hidden_dim = hidden_dim

        self.kernel_size = kernel_size
        self.padding     = kernel_size[0] // 2, kernel_size[1] // 2
        self.bias        = bias

        self.conv = nn.Conv2d(in_channels=self.input_dim + self.hidden_dim,
                              out_channels=4 * self.hidden_dim,
                              kernel_size=self.kernel_size,
                              padding=self.padding,
                              bias=self.bias)

    def forward(self, input_tensor, cur_state):

        h_cur, c_cur = cur_state

        combined = torch.cat([input_tensor, h_cur], dim=1)  # concatenate along channel axis

        combined_conv = self.conv(combined)
        cc_i, cc_f, cc_o, cc_g = torch.split(combined_conv, self.hidden_dim, dim=1)
        i = torch.sigmoid(cc_i)
        f = torch.sigmoid(cc_f)
        o = torch.sigmoid(cc_o)
        g = torch.tanh(cc_g)

        c_next = f * c_cur + i * g
        h_next = o * torch.tanh(c_next)

        return h_next, c_next

    def init_hidden(self, batch_size):
        return (Variable(torch.zeros(batch_size, self.hidden_dim, self.height, self.width)).cpu(),
                Variable(torch.zeros(batch_size, self.hidden_dim, self.height, self.width)).cpu())

class ConvLSTM(nn.Module):

    def __init__(self, input_size, input_dim, hidden_dim, kernel_size, num_layers,
                 batch_first=False, bias=True, return_all_layers=False):
        super(ConvLSTM, self).__init__()

        self._check_kernel_size_consistency(kernel_size)

        # Make sure that both `kernel_size` and `hidden_dim` are lists having len == num_layers
        kernel_size = self._extend_for_multilayer(kernel_size, num_layers)
        hidden_dim  = self._extend_for_multilayer(hidden_dim, num_layers)
        if not len(kernel_size) == len(hidden_dim) == num_layers:
            raise ValueError('Inconsistent list length.')

        self.height, self.width = input_size

        self.input_dim  = input_dim
        self.hidden_dim = hidden_dim
        self.kernel_size = kernel_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.bias = bias
        self.return_all_layers = return_all_layers

        cell_list = []
        for i in range(0, self.num_layers):
            cur_input_dim = self.input_dim if i == 0 else self.hidden_dim[i-1]

            cell_list.append(ConvLSTMCell(input_size=(self.height, self.width),
                                          input_dim=cur_input_dim,
                                          hidden_dim=self.hidden_dim[i],
                                          kernel_size=self.kernel_size[i],
                                          bias=self.bias))

        self.cell_list = nn.ModuleList(cell_list)

    def forward(self, input_tensor, hidden_state=None):
        """

        Parameters
        ----------
        input_tensor: todo
            5-D Tensor either of shape (t, b, c, h, w) or (b, t, c, h, w)
        hidden_state: todo
            None. todo implement stateful

        Returns
        -------
        last_state_list, layer_output
        """
        if not self.batch_first:
            # (t, b, c, h, w) -> (b, t, c, h, w)
            input_tensor = input_tensor.permute(1, 0, 2, 3, 4)

        # Implement stateful ConvLSTM
        if hidden_state is not None:
            raise NotImplementedError()
        else:
            hidden_state = self._init_hidden(batch_size=input_tensor.size(0))

        layer_output_list = []
        last_state_list   = []

        seq_len = input_tensor.size(1)
        cur_layer_input = input_tensor

        for layer_idx in range(self.num_layers):

            h, c = hidden_state[layer_idx]
            output_inner = []
            for t in range(seq_len):

                h, c = self.cell_list[layer_idx](input_tensor=cur_layer_input[:, t, :, :, :],
                                                 cur_state=[h, c])
                output_inner.append(h)

            layer_output = torch.stack(output_inner, dim=1)
            cur_layer_input = layer_output

            layer_output_list.append(layer_output)
            last_state_list.append([h, c])

        if not self.return_all_layers:
            layer_output_list = layer_output_list[-1:]
            last_state_list   = last_state_list[-1:]

        return layer_output_list, last_state_list

    def _init_hidden(self, batch_size):
        init_states = []
        for i in range(self.num_layers):
            init_states.append(self.cell_list[i].init_hidden(batch_size))
        return init_states

    @staticmethod
    def _check_kernel_size_consistency(kernel_size):
        if not (isinstance(kernel_size, tuple) or
                (isinstance(kernel_size, list) and all([isinstance(elem, tuple) for elem in kernel_size]))):
            raise ValueError('`kernel_size` must be tuple or list of tuples')

    @staticmethod
    def _extend_for_multilayer(param, num_layers):
        if not isinstance(param, list):
            param = [param] * num_layers
        return param

This is my initialization:

model = ConvLSTM(input_size=(4, 8), input_dim=3, hidden_dim=[64, 64, 128],
                 kernel_size=(3, 3), num_layers=3, batch_first=True)

What do you mean by dummy parameters? By default, if you use Conv2d, which your cell does, the conv weights are initialized with the default initialization algorithm for Conv2d. So all the parameters are there for sure.
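
In other words, a freshly constructed ConvLSTM already has (randomly initialized) parameters. A minimal sketch, reusing your own constructor arguments, that lists their names and shapes so you can compare them against the keys in your saved file:

model = ConvLSTM(input_size=(4, 8), input_dim=3, hidden_dim=[64, 64, 128],
                 kernel_size=(3, 3), num_layers=3, batch_first=True)

for name, p in model.named_parameters():
    print(name, tuple(p.shape))
# should print cell_list.0.conv.weight, cell_list.0.conv.bias, ... up to cell_list.2

Whatever this prints is exactly the set of keys load_state_dict() expects, so the keys in your .pth file must match it one-for-one.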