Incorporating a CUDA kernel (TypeError: 'struct THCudaTensor' is opaque)

Hello,

I have been trying to incorporate my own CUDA kernel for a Highway LSTM into a PyTorch layer, mostly following the suggestions in the thread "Compiling an Extension with CUDA files".

As suggested in that thread, I read the data pointers from the tensors and launch the kernel like this:

#include <THC/THC.h>
#include "highway_lstm_kernel.h"

extern THCState *state;

/*
 * Forward entry point of the Highway LSTM extension.
 *
 * Obtains a float pointer for each tensor argument via THCudaTensor_data(),
 * looks up the current stream and BLAS handle on the global THC state, and
 * hands everything to highway_lstm_ongpu() together with the integer
 * arguments, which are forwarded unchanged.
 *
 * Returns 1 unconditionally; no status from highway_lstm_ongpu() is
 * inspected here.
 */
int highway_lstm_forward_cuda(int inputSize, int hiddenSize, int miniBatch,
        int numLayers, int seqLength,
        THCudaTensor *x,
        THCudaTensor *h_data,
        THCudaTensor *c_data,
        THCudaTensor *tmp_i,
        THCudaTensor *tmp_h,
        THCudaTensor *T,
        THCudaTensor *bias,
        THCudaTensor *dropout,
        THCudaTensor *gates,
        int isTraining) {

    /* Float storage behind each tensor, in parameter order. */
    float *x_buf       = THCudaTensor_data(state, x);
    float *h_buf       = THCudaTensor_data(state, h_data);
    float *c_buf       = THCudaTensor_data(state, c_data);
    float *tmp_i_buf   = THCudaTensor_data(state, tmp_i);
    float *tmp_h_buf   = THCudaTensor_data(state, tmp_h);
    float *T_buf       = THCudaTensor_data(state, T);
    float *bias_buf    = THCudaTensor_data(state, bias);
    float *dropout_buf = THCudaTensor_data(state, dropout);
    float *gates_buf   = THCudaTensor_data(state, gates);

    /* Stream and BLAS handle currently selected on the THC state. */
    cudaStream_t cur_stream = THCState_getCurrentStream(state);
    cublasHandle_t blas = THCState_getCurrentBlasHandle(state);

    highway_lstm_ongpu(inputSize, hiddenSize, miniBatch, numLayers, seqLength,
            x_buf, h_buf, c_buf, tmp_i_buf, tmp_h_buf, T_buf, bias_buf,
            dropout_buf, gates_buf, isTraining, cur_stream, blas);

    return 1;
}

And then I call this from within Python like so:

# Call the extension's forward entry point: five integer sizes, then nine
# tensor arguments, then a 0/1 training flag. `weight` is passed in the
# position of the C parameter `T`.
# NOTE(review): presumably every tensor argument must be a CUDA float tensor
# for the FFI layer to accept it as a THCudaTensor — confirm.
# NOTE(review): if self.train is a bound method rather than a bool, the last
# argument is always 1 — confirm.
highway_lstm_layer.highway_lstm_forward_cuda(
        self.input_size, self.hidden_size, self.mini_batch, self.num_layers,
        self.seq_length, input, hy, cy, tmp_i, tmp_h, weight, bias, dropout,
        gates, 1 if self.train else 0)

However, I get the following error:

Traceback (most recent call last):
  File "highway_lstm_layer.py", line 112, in <module>
    print lstm(input)
  File "/home/nfitz/miniconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 224, in __call__
    result = self.forward(*input, **kwargs)
  File "highway_lstm_layer.py", line 96, in forward
    output, hidden = HighwayLSTMFunction(self.input_size, self.hidden_size, num_layers=self.num_layers, dropout=self.dropout, train=self.train)(input, self.weight, self.bias)
  File "/home/nfitz/miniconda2/lib/python2.7/site-packages/torch/autograd/function.py", line 284, in _do_forward
    flat_output = super(NestedIOFunction, self)._do_forward(*flat_input)
  File "/home/nfitz/miniconda2/lib/python2.7/site-packages/torch/autograd/function.py", line 306, in forward
    result = self.forward_extended(*nested_tensors)
  File "highway_lstm_layer.py", line 34, in forward_extended
    gates, 1 if self.train else 0)
  File "/home/nfitz/miniconda2/lib/python2.7/site-packages/torch/utils/ffi/__init__.py", line 177, in safe_call
    result = torch._C._safe_call(*args, **kwargs)
TypeError: 'struct THCudaTensor' is opaque

Any hints on what would be causing this?