Assertion failure in slice_op when running a Caffe2 model

I am running a model I trained with PyTorch in Caffe2. The error is:

256
starting
terminate called after throwing an instance of 'at::Error'
  what():  [enforce fail at slice_op.h:28] data.ndim() >= starts.size(). 2 vs 3Error from operator: 
input: "1" output: "OC2_DUMMY_80/initial_h" name: "" type: "Slice" arg { name: "ends" ints: 1 ints: -1 ints: -1 } arg { name: "starts" ints: 0 ints: 0 ints: 0 } device_option { device_type: 0 device_id: 0 }
Aborted (core dumped)

EDIT: My [1:] and [:-1] slicing does not seem to be the problem.

I have a slice operation in my forward step, but I do not understand where this error is coming from, because none of the sliced tensors have 2 dimensions (they all have 3). The input vector (see below) is used to index into an embedding, so it should go from 2 to 3 dimensions.
The fact that the operator lists input: "1" and the output OC2_DUMMY_80/initial_h makes me think the problem might be with my hidden state, but that is never sliced, and making changes to it has no effect at all (increasing the length of the input vector has no effect; changing the number of dimensions makes it crash during the RNN call, as one would expect).
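
One way to narrow this down is to dump every Slice operator from the exported predict_net and see which blob gets the three-element starts. A minimal sketch (assuming the predict_net.pb path from the inference code below; otherwise it is just a generic protobuf walk):

from caffe2.proto import caffe2_pb2

predict_net = caffe2_pb2.NetDef()
with open("/work/case/predict_net.pb", "rb") as f:
    predict_net.ParseFromString(f.read())

# List every Slice operator with its inputs, outputs, and starts/ends arguments,
# to see which blob is being sliced with three start indices.
for op in predict_net.op:
    if op.type == "Slice":
        print("inputs:", list(op.input), "-> outputs:", list(op.output))
        for arg in op.arg:
            print("   ", arg.name, list(arg.ints))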

Here is the inference code:

// Headers: the predictor header path may differ between Caffe2 versions
// (older releases use caffe2/core/predictor.h instead).
#include <iostream>
#include <vector>

#include "caffe2/predictor/predictor.h"
#include "caffe2/utils/proto_utils.h"

// Defined elsewhere: fills the vector with the trained initial hidden state.
void readInitState(std::vector<float>& init_state);

int main(int argc, const char* argv[]) {
    caffe2::NetDef init_net, predict_net;

    ReadProtoFromFile("/work/case/init_net.pb", &init_net);
    ReadProtoFromFile("/work/case/predict_net.pb", &predict_net);

    caffe2::Predictor* pred = new caffe2::Predictor(init_net, predict_net);

    // Token indices, shape (seq_len=2, batch=1); the embedding adds the third dimension.
    std::vector<long int> data = {1, 1};
    caffe2::CPUContext cpu_context;
    caffe2::TensorCPU input({2, 1}, data, &cpu_context);

    // Initial hidden state, shape (num_layers=1, batch=1, num_hid=256).
    std::vector<float> init_state;
    readInitState(init_state);
    std::cout << init_state.size() << std::endl;
    caffe2::TensorCPU hidden_input({1, 1, 256}, init_state, &cpu_context);

    caffe2::Predictor::TensorList input_vec = {input, hidden_input};
    caffe2::Predictor::TensorList output_vec;

    std::cout << "starting" << std::endl;
    (*pred)(input_vec, &output_vec);
    std::cout << "finished" << std::endl;
    for (int i = 0; i < output_vec[0].size(); i++) {
        std::cout << output_vec[0].data<float>()[i] << std::endl;
    }
}
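
The nets can also be driven from Python with the Caffe2 workspace API to poke at blob shapes before the failing Slice runs. This is only a sketch: the blob names are taken from predict_net.external_input (which may also list parameter blobs, depending on the exporter), and the shapes are copied from the C++ code above.

import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import workspace

def load_net(path):
    net = caffe2_pb2.NetDef()
    with open(path, "rb") as f:
        net.ParseFromString(f.read())
    return net

init_net = load_net("/work/case/init_net.pb")
predict_net = load_net("/work/case/predict_net.pb")

# Running init_net once fills the workspace with the trained parameters.
workspace.RunNetOnce(init_net)

# external_input lists every blob predict_net expects to exist before it runs
# (the two real inputs and, depending on the exporter, the parameter blobs too).
print(list(predict_net.external_input))

# Assumption: the first two external inputs are the token indices and the initial
# hidden state; adjust the names after inspecting the printout above.
workspace.FeedBlob(predict_net.external_input[0], np.array([[1], [1]], dtype=np.int64))
workspace.FeedBlob(predict_net.external_input[1], np.zeros((1, 1, 256), dtype=np.float32))

workspace.RunNetOnce(predict_net)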

And here is the model in PyTorch:

import torch as t
import torch.nn as nn
import torch.nn.functional as F


class CRNN(nn.Module):
    def __init__(self, num_inp, num_hid, num_ff, num_layers, num_out, inv_priors):
        super(CRNN, self).__init__()

        self.embed = nn.Embedding(num_inp, num_hid)
        self.rnn = nn.GRU(num_hid, num_hid, num_layers)
        self.fc_emb_skip = nn.Linear(num_hid, num_ff)
        self.fc1 = nn.Linear(num_hid + num_ff, num_ff)
        self.fc2 = nn.Linear(num_ff, 3)
        self.hidden_init = nn.Parameter(t.randn(num_layers, 1, num_hid).type(t.FloatTensor), requires_grad=True)
        self.num_inp = num_inp
        self.num_hid = num_hid
        self.num_out = num_out
        self.num_layers = num_layers

    def forward(self, x, hidden):
        # x: (seq_len, batch) token indices; emb: (seq_len, batch, num_hid)
        emb = self.embed(x)
        output, hidden = self.rnn(emb, hidden)
        # Both slices act on dim 0 (time), so the sliced tensors stay 3-D.
        y_skip = F.elu(self.fc_emb_skip(emb[:-1]))
        joined_out = t.cat((y_skip, output[1:]), dim=2)
        outview = joined_out.contiguous().view(joined_out.size(0) * joined_out.size(1), joined_out.size(2))
        y = F.elu(self.fc1(outview))
        probs = F.log_softmax(self.fc2(y), 1)
        return probs, hidden
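
A quick shape check in PyTorch (using the class above) confirms that emb[:-1] and output[1:] stay 3-D. The hyperparameters below are placeholders for the check (only num_hid=256 matches the hidden state fed in the C++ code), and the commented-out export line is only illustrative of the kind of ONNX export that would produce the Caffe2 nets.

import torch as t

# Placeholder hyperparameters; only num_hid=256 mirrors the real model.
model = CRNN(num_inp=100, num_hid=256, num_ff=64, num_layers=1, num_out=3, inv_priors=None)

x = t.ones(2, 1).long()        # (seq_len=2, batch=1) token indices
hidden = model.hidden_init     # (num_layers=1, batch=1, num_hid=256)

emb = model.embed(x)
output, _ = model.rnn(emb, hidden)
print(emb[:-1].shape, output[1:].shape)   # both (1, 1, 256): still 3-D

probs, hidden_out = model(x, hidden)
print(probs.shape, hidden_out.shape)      # (1, 3) and (1, 1, 256)

# Roughly the kind of export that produces the Caffe2 nets (file name is illustrative):
# t.onnx.export(model, (x, hidden), "model.onnx")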