Hello, I’m trying to build a custom GRU with some additional weights and biases. The following code block is a complete, self-contained reproduction of the issue. I’m using PyTorch version 1.8.2+cu111.
import math
import warnings
import torch
import torch.nn
from typing import Optional, Union
from torch.nn.utils.rnn import PackedSequence
from torch.nn.functional import softmax, one_hot
from torch import sigmoid, tanh
class BlockGRU(torch.nn.RNNBase):
    def __init__(
            self,
            input_size: int,
            hidden_size: int,
            num_layers: int = 1,
            num_blocks: int = 1,
            bias: bool = True,
            batch_first: bool = True,
            bidirectional: bool = False,
            beta: int = 10,
            device=None,
            dtype=None) -> None:
        super().__init__('GRU',
                         input_size=input_size,
                         hidden_size=hidden_size,
                         num_layers=num_layers,
                         bias=bias,
                         batch_first=batch_first,
                         bidirectional=bidirectional)
        factory_kwargs = {'device': device, 'dtype': dtype}
        if num_blocks < 1:
            raise ValueError(
                'num_blocks must be >= 1')
        if (hidden_size / num_blocks) != (hidden_size // num_blocks):
            raise ValueError(
                'hidden_size must be evenly divisible by num_blocks')
        if bidirectional:
            raise NotImplementedError('no support for bidirectional GRU')
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_blocks = num_blocks
        self.block_size = hidden_size // num_blocks
        self.bias = bias
        self.beta = beta
        num_directions = 2 if bidirectional else 1
        self._flat_weights_names = []
        self._all_weights = []
        for layer in range(num_layers):
            for direction in range(num_directions):
                gate_size = 3 * hidden_size
                l_input_size = input_size
                if layer != 0:
                    l_input_size = hidden_size * num_directions
                # per-block "k" weights/biases, in addition to the standard GRU gate parameters
                w_ik = torch.nn.Parameter(
                    torch.empty((num_blocks, l_input_size), **factory_kwargs))
                w_hk = torch.nn.Parameter(
                    torch.empty((num_blocks, hidden_size), **factory_kwargs))
                w_ih = torch.nn.Parameter(
                    torch.empty((gate_size, l_input_size), **factory_kwargs))
                w_hh = torch.nn.Parameter(
                    torch.empty((gate_size, hidden_size), **factory_kwargs))
                b_ih = torch.nn.Parameter(
                    torch.empty(gate_size, **factory_kwargs))
                b_hh = torch.nn.Parameter(
                    torch.empty(gate_size, **factory_kwargs))
                b_ik = torch.nn.Parameter(
                    torch.empty(num_blocks, **factory_kwargs))
                b_hk = torch.nn.Parameter(
                    torch.empty(num_blocks, **factory_kwargs))
                if bias:
                    layer_params = (w_ik, w_hk, w_ih, w_hh, b_ik, b_hk, b_ih, b_hh)
                else:
                    layer_params = (w_ik, w_hk, w_ih, w_hh)
                suffix = '_reverse' if direction == 1 else ''
                param_names = [
                    'weight_ik_l{}{}',
                    'weight_hk_l{}{}',
                    'weight_ih_l{}{}',
                    'weight_hh_l{}{}',
                ]
                if bias:
                    param_names += [
                        'bias_ik_l{}{}',
                        'bias_hk_l{}{}',
                        'bias_ih_l{}{}',
                        'bias_hh_l{}{}',
                    ]
                param_names = [x.format(layer, suffix) for x in param_names]
                for name, param in zip(param_names, layer_params):
                    setattr(self, name, param)
                self._flat_weights_names.extend(param_names)
                self._all_weights.append(param_names)
        # mirror RNNBase: rebuild the flat-weights list so it includes the custom tensors
        self._flat_weights = [
            (lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn)
            for wn in self._flat_weights_names]
        self.flatten_parameters()
        self.reset_parameters()
net = BlockGRU(500, 100, 2, 2)
net = net.cuda()
The model works as intended on CPU, but I hit the following error as soon as I try to move it to CUDA:
Traceback (most recent call last):
  File "experiment.py", line 123, in <module>
    main()
  File "experiment.py", line 60, in main
    net = net.cuda()
  File "/home/asivara/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 491, in cuda
    return self._apply(lambda t: t.cuda(device))
  File "/home/asivara/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 387, in _apply
    module._apply(fn)
  File "/home/asivara/.local/lib/python3.8/site-packages/torch/nn/modules/rnn.py", line 186, in _apply
    self.flatten_parameters()
  File "/home/asivara/.local/lib/python3.8/site-packages/torch/nn/modules/rnn.py", line 172, in flatten_parameters
    torch._cudnn_rnn_flatten_weight(
RuntimeError: params_from.size(0) == params_to.size(0) INTERNAL ASSERT FAILED at "../aten/src/ATen/native/cudnn/RNN.cpp":626, please report a bug to PyTorch. number of layers mismatch
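For reference, on CPU everything is registered the way I intended: each layer ends up with eight tensors (the four standard GRU ones plus my per-block w_ik/w_hk/b_ik/b_hk), whereas a stock GRU layer registers only four, which I’m guessing is related to the "number of layers mismatch" assert. A quick check of the attributes set in __init__:

net = BlockGRU(500, 100, 2, 2)
# 2 layers x 8 tensors per layer = 16 entries; a stock 2-layer GRU with bias would have 8
print(len(net._flat_weights_names))   # 16
print(net._flat_weights_names[:8])
# ['weight_ik_l0', 'weight_hk_l0', 'weight_ih_l0', 'weight_hh_l0',
#  'bias_ik_l0', 'bias_hk_l0', 'bias_ih_l0', 'bias_hh_l0']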
There’s some issue with flattening the parameters, but I don’t know where to start debugging it. I know I technically don’t need the redundant biases b_hh and b_hk, but according to comments in the RNNBase source code, they are needed for CuDNN compatibility.
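The only idea I’ve come up with so far is to skip the cuDNN flattening entirely for this subclass, along the lines of the sketch below, but I don’t know whether that’s a legitimate fix or whether it just hides the problem (and gives up the cuDNN fast path):

# Possible workaround I'm considering (untested): override flatten_parameters on
# BlockGRU so that .cuda() never reaches torch._cudnn_rnn_flatten_weight, which
# expects 4 tensors per layer/direction (with bias) while this class registers 8.
def flatten_parameters(self) -> None:   # defined inside the BlockGRU class body
    # Intentionally a no-op: the extra per-block tensors (weight_ik, weight_hk,
    # bias_ik, bias_hk) don't fit the flat layout cuDNN expects, so skip flattening.
    return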
Can anyone point me in the right direction regarding how to get a custom GRU model working on GPU?