Greetings!

I implemented a layer-normalized LSTMCell from scratch. Everything works fine, but it is much slower than the original LSTM. I noticed that the original LSTMCell is based on LSTMFused_updateOutput, which is implemented in C code. I am wondering if there is some easy way to speed up the LayerNorm LSTM without modifying the C implementation in the backend? Thank you very much!

Here is my code:

```
class LayerNorm(nn.Module):
    """Layer normalization over the feature dimension (dim 1).

    Normalizes each row of a (batch, nb_features) input to zero mean and
    unit (unbiased) std, then applies a learned per-feature gain and bias.
    """

    def __init__(self, nb_features, eps=1e-5):
        super(LayerNorm, self).__init__()
        self.eps = eps  # added to std to avoid division by zero
        self.gain = nn.Parameter(torch.ones(1, nb_features))
        self.bias = nn.Parameter(torch.zeros(1, nb_features))

    def forward(self, input):
        # keepdim=True + broadcasting replaces the expand_as pattern:
        # on modern PyTorch, mean(1)/std(1) drop the reduced dim, so the
        # original `.mean(1).expand_as(input)` would raise a size error.
        mean = input.mean(1, keepdim=True)
        std = input.std(1, keepdim=True)
        normed = (input - mean) / (std + self.eps)
        # gain/bias are (1, nb_features) and broadcast over the batch dim.
        return normed * self.gain + self.bias
class LayerNormLSTMCell(nn.Module):
    """LSTM cell with layer normalization applied to the input-to-hidden
    and hidden-to-hidden gate pre-activations, and to the cell state
    before the output tanh (as in Ba et al., "Layer Normalization").

    Interface matches nn.LSTMCell: forward(input, (hx, cx)) -> (hy, cy).
    """

    def __init__(self, input_size, hidden_size):
        super(LayerNormLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # The 4*hidden_size rows pack the i, f, g, o gates so each step
        # needs only one matmul per input (nn.Parameter, not the bare
        # `Parameter` of the original, which needed an extra import).
        self.weight_ih = nn.Parameter(torch.Tensor(4 * hidden_size, input_size))
        self.weight_hh = nn.Parameter(torch.Tensor(4 * hidden_size, hidden_size))
        self.bias_ih = nn.Parameter(torch.Tensor(4 * hidden_size))
        self.bias_hh = nn.Parameter(torch.Tensor(4 * hidden_size))
        self.ln_ih = LayerNorm(4 * hidden_size)
        self.ln_hh = LayerNorm(4 * hidden_size)
        self.ln_ho = LayerNorm(hidden_size)
        self.reset_parameters()

    def reset_parameters(self):
        # BUG FIX: the original never initialized its weights, leaving the
        # uninitialized memory returned by torch.Tensor(...). Match
        # nn.LSTMCell's uniform(-1/sqrt(hidden_size), 1/sqrt(hidden_size)).
        stdv = self.hidden_size ** -0.5
        for p in (self.weight_ih, self.weight_hh, self.bias_ih, self.bias_hh):
            nn.init.uniform_(p, -stdv, stdv)

    def forward(self, input, hidden):
        """One step. input: (batch, input_size); hidden: (hx, cx), each
        (batch, hidden_size). Returns (hy, cy)."""
        hx, cx = hidden
        # Both gate projections are layer-normalized before being summed.
        gates = (self.ln_ih(F.linear(input, self.weight_ih, self.bias_ih))
                 + self.ln_hh(F.linear(hx, self.weight_hh, self.bias_hh)))
        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
        # torch.sigmoid/torch.tanh: F.sigmoid and F.tanh are deprecated.
        ingate = torch.sigmoid(ingate)
        forgetgate = torch.sigmoid(forgetgate)
        cellgate = torch.tanh(cellgate)
        outgate = torch.sigmoid(outgate)
        cy = (forgetgate * cx) + (ingate * cellgate)
        hy = outgate * torch.tanh(self.ln_ho(cy))
        return hy, cy
```