CuDNN issues with Custom GRU | "params_from.size(0) == params_to.size(0)"

Hello, I’m trying to build a custom GRU with some additional weights and biases. The following self-contained code block reproduces the issue. I’m using PyTorch 1.8.2+cu111.

import math
import warnings
import torch
import torch.nn

from typing import Optional, Union
from torch.nn.utils.rnn import PackedSequence
from torch.nn.functional import softmax, one_hot
from torch import sigmoid, tanh

class BlockGRU(torch.nn.RNNBase):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers: int = 1,
        num_blocks: int = 1,
        bias: bool = True,
        batch_first: bool = True,
        bidirectional: bool = False,
        beta: int = 10,
        device=None,
        dtype=None) -> None:

        super().__init__('GRU',
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=bias,
            batch_first=batch_first,
            bidirectional=bidirectional)
        factory_kwargs = {'device': device, 'dtype': dtype}
        if num_blocks < 1:
            raise ValueError(
                'num_blocks must be >= 1')
        if hidden_size % num_blocks != 0:
            raise ValueError(
                'hidden_size must be evenly divisible by num_blocks')
        if bidirectional:
            raise NotImplementedError('no support for bidirectional GRU')

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_blocks = num_blocks
        self.block_size = hidden_size // num_blocks
        self.bias = bias
        self.beta = beta
        num_directions = 2 if bidirectional else 1
        # Rebuild the flat-weight bookkeeping that RNNBase.__init__ already
        # set up, this time including the extra per-block parameters.
        self._flat_weights_names = []
        self._all_weights = []
        for layer in range(num_layers):
            for direction in range(num_directions):
                gate_size = 3 * hidden_size
                l_input_size = input_size
                if layer != 0:
                    l_input_size = hidden_size * num_directions
                # additional per-block weights introduced by this architecture
                w_ik = torch.nn.Parameter(
                    torch.empty((num_blocks, l_input_size), **factory_kwargs))
                w_hk = torch.nn.Parameter(
                    torch.empty((num_blocks, hidden_size), **factory_kwargs))
                # standard GRU gate weights, as in torch.nn.GRU
                w_ih = torch.nn.Parameter(
                    torch.empty((gate_size, l_input_size), **factory_kwargs))
                w_hh = torch.nn.Parameter(
                    torch.empty((gate_size, hidden_size), **factory_kwargs))
                b_ih = torch.nn.Parameter(
                    torch.empty(gate_size, **factory_kwargs))
                b_hh = torch.nn.Parameter(
                    torch.empty(gate_size, **factory_kwargs))
                b_ik = torch.nn.Parameter(
                    torch.empty(num_blocks, **factory_kwargs))
                b_hk = torch.nn.Parameter(
                    torch.empty(num_blocks, **factory_kwargs))
                if bias:
                    layer_params = (w_ik, w_hk, w_ih, w_hh, b_ik, b_hk, b_ih, b_hh)
                else:
                    layer_params = (w_ik, w_hk, w_ih, w_hh)

                suffix = '_reverse' if direction == 1 else ''
                param_names = [
                    'weight_ik_l{}{}',
                    'weight_hk_l{}{}',
                    'weight_ih_l{}{}',
                    'weight_hh_l{}{}',
                ]
                if bias:
                    param_names += [
                        'bias_ik_l{}{}',
                        'bias_hk_l{}{}',
                        'bias_ih_l{}{}',
                        'bias_hh_l{}{}',
                    ]
                param_names = [x.format(layer, suffix) for x in param_names]

                for name, param in zip(param_names, layer_params):
                    setattr(self, name, param)
                self._flat_weights_names.extend(param_names)
                self._all_weights.append(param_names)

        # Same _flat_weights construction as in RNNBase itself.
        self._flat_weights = [
            (lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn)
            for wn in self._flat_weights_names]
        self.flatten_parameters()
        self.reset_parameters()

net = BlockGRU(500, 100, 2, 2)
net = net.cuda()  # <- this is the line that raises the error below

The model actually works as intended on the CPU, but I hit the following error when I try to move it to CUDA:

Traceback (most recent call last):
  File "experiment.py", line 123, in <module>
    main()
  File "experiment.py", line 60, in main
    net = net.cuda()
  File "/home/asivara/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 491, in cuda
    return self._apply(lambda t: t.cuda(device))
  File "/home/asivara/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 387, in _apply
    module._apply(fn)
  File "/home/asivara/.local/lib/python3.8/site-packages/torch/nn/modules/rnn.py", line 186, in _apply
    self.flatten_parameters()
  File "/home/asivara/.local/lib/python3.8/site-packages/torch/nn/modules/rnn.py", line 172, in flatten_parameters
    torch._cudnn_rnn_flatten_weight(
RuntimeError: params_from.size(0) == params_to.size(0) INTERNAL ASSERT FAILED at "../aten/src/ATen/native/cudnn/RNN.cpp":626, please report a bug to PyTorch. number of layers mismatch

Something goes wrong when the parameters are flattened, but I don’t know where to start debugging this. I know I technically don’t need the redundant biases b_hh and b_hk, but according to the comments in the RNNBase source code they are kept around for CuDNN compatibility, so I left them in.
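
The assert message mentions a “number of layers mismatch”, so my guess is that the problem is simply the parameter count: a stock GRU with bias registers 4 tensors per layer per direction (weight_ih, weight_hh, bias_ih, bias_hh), while my class registers 8, and presumably that is what makes the flattening routine mis-derive the number of layers. A quick way to see the mismatch, run in the repro script before the .cuda() call (this relies on the private _flat_weights_names attribute, so it may not be stable across versions):

ref = torch.nn.GRU(500, 100, num_layers=2, bias=True, batch_first=True)
print(len(ref._flat_weights_names))  # 8  -> 4 tensors per layer x 2 layers
print(len(net._flat_weights_names))  # 16 -> 8 tensors per layer x 2 layers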

Can anyone point me in the right direction regarding how to get a custom GRU model working on GPU?

I found that I’m able to get my model to run on the GPU if I comment out all calls to self.flatten_parameters(), but then the forward/backward pass takes about 13x longer than a standard PyTorch GRU with an equivalent number of units and layers.
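
Rather than commenting out the calls in the installed rnn.py, I assume the same effect can be had by overriding flatten_parameters() as a no-op in the subclass, roughly like the sketch below (it only avoids editing library code, it doesn’t fix the slowdown):

class BlockGRU(torch.nn.RNNBase):
    # ... __init__ as above ...

    def flatten_parameters(self) -> None:
        # The extra w_ik/w_hk/b_ik/b_hk tensors do not match cuDNN's
        # per-layer weight layout, so there is nothing valid to flatten.
        # Skipping this keeps RNNBase._apply (and therefore .cuda()) from
        # reaching torch._cudnn_rnn_flatten_weight; the call in __init__
        # above also becomes a no-op.
        return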

I’d really appreciate any help getting this model to work! To put the question another way: is there a correct way to introduce new weights and biases into the GRU architecture? Thanks in advance!

I don’t know which additional weights and biases you are introducing, but I guess your new architecture might not be compatible with the cuDNN GRU definition, so disabling the parameter flattening and using the native (non-cuDNN) implementation might be the proper way to implement your custom module.
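
For reference, by “native approach” I mean writing the recurrence yourself on top of the registered weights instead of relying on the fused cuDNN kernel. A minimal single-layer sketch, assuming the standard PyTorch gate ordering (reset, update, new) and leaving out your block weights, since only you know how they enter the cell:

import torch

def gru_layer(x, h, w_ih, w_hh, b_ih, b_hh):
    # x: (seq_len, batch, input_size), h: (batch, hidden_size)
    outputs = []
    for x_t in x.unbind(0):
        gi = torch.nn.functional.linear(x_t, w_ih, b_ih)  # (batch, 3 * hidden_size)
        gh = torch.nn.functional.linear(h, w_hh, b_hh)
        i_r, i_z, i_n = gi.chunk(3, dim=1)
        h_r, h_z, h_n = gh.chunk(3, dim=1)
        r = torch.sigmoid(i_r + h_r)   # reset gate
        z = torch.sigmoid(i_z + h_z)   # update gate
        n = torch.tanh(i_n + r * h_n)  # candidate state
        h = (1 - z) * n + z * h
        outputs.append(h)
    return torch.stack(outputs, dim=0), h

This runs fine on the GPU as ordinary tensor ops, but it launches several small kernels per time step, which is where most of the gap to the fused cuDNN GRU comes from.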