Sequence classification with variable-length padded data: RuntimeError during training

I don't quite understand what the problem is with the dimensions going into the linear layer. I had a dimension mismatch earlier, so I now take the hidden output of the last timestep (i.e. timestep 20) for the linear classification, but training still fails with the RuntimeError shown below.
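For reference, here is a minimal standalone sketch of the shape flow I am assuming for the pack/unpack and the last-timestep selection (the toy tensors and batch of 4 are made up; hidden size 10, 20 timesteps and 200 features match my training code):

import torch
import torch.nn as nn

# Toy batch: 4 samples, 20 timesteps, 200 features (values are random/hypothetical)
x = torch.randn(4, 20, 200)
lengths = torch.tensor([20, 15, 10, 5])   # unpadded length of each sample

lstm = nn.LSTM(input_size=200, hidden_size=10, num_layers=2)
packed = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True,
                                                 enforce_sorted=False)
out, (h, c) = lstm(packed)                # hidden state defaults to zeros here
out, _ = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

print(out.shape)            # torch.Size([4, 20, 10]) -> (batch, seq, hidden_dim)
print(out[:, -1, :].shape)  # torch.Size([4, 10])     -> what goes into nn.Linear(10, 1)

My full model and training code is below.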

import numpy as np
import h5py
import torch
import torch.nn as nn
from torch.autograd import Variable
from keras.utils import to_categorical
from pdb import set_trace
# Here we define our model as a class
class LSTM(nn.Module):

    def __init__(self, input_dim, hidden_dim, batch_size, output_dim=1,
                    num_layers=2, features=200, s_len=20):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.hidden = self.init_hidden()
        self.s_len = s_len
        self.features = features
        
        # Define the LSTM layer
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)

        # Define the output layer
        self.linear = nn.Linear(self.hidden_dim, output_dim)

    def init_hidden(self):
        # This is what we'll initialise our hidden state as
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))

    def forward(self, X):
        # Forward pass through LSTM layer
        # shape of lstm_out (after pad_packed_sequence with batch_first=True): [batch_size, seq_len, hidden_dim]
        # shape of self.hidden: (a, b), where a and b both 
        # have shape (num_layers, batch_size, hidden_dim).

        # pack_padded_sequence so that padded items in the sequence won't be shown to the LSTM
        #input.view(len(input), self.batch_size, -1) i.e X
        ######## recover each sequence's length before padding, since the sequences are already zero-padded
        zeros = torch.zeros((1, self.features))
        lengths = torch.zeros((X.shape[0]))
        for i in range(X.shape[0]):
          zcount=0
          for j in range(self.s_len):
            if (X[i,j,:]==zeros).sum()==self.features:
              zcount+=1
          # print(count)
          lengths[i]=self.s_len-zcount
        ########################
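        # (A vectorized equivalent of the loop above, assuming -- as the loop does --
        #  that padded timesteps are exactly the all-zero rows:
        #  lengths = (X.abs().sum(dim=2) != 0).sum(dim=1)  )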
        seq_lengths = lengths
        X = torch.nn.utils.rnn.pack_padded_sequence(X, seq_lengths, batch_first=True, enforce_sorted=False)
        lstm_out, self.hidden = self.lstm(X, self.hidden)
        # set_trace()
        # undo the packing operation
        lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        # set_trace()
        # Only take the output from the final timestep
        # Can pass on the entirety of lstm_out to the next layer if it is a seq2seq prediction
        #y_pred = self.linear(lstm_out[-1].view(self.batch_size, -1))
        y_pred = self.linear(lstm_out[:,-1,:])
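        # (If I instead wanted the last *non-padded* timestep of every sequence, I think a
        #  gather over the lengths would do it -- noting it here only as an aside:
        #  idx = (seq_lengths - 1).long().view(-1, 1, 1).expand(-1, 1, lstm_out.size(2))
        #  y_pred = self.linear(lstm_out.gather(1, idx).squeeze(1))  )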
        return y_pred.view(-1)


file_path1 = 'tr_20.mat'  # 2097 x 20 x 200: samples x seq x features
A01T = h5py.File(file_path1,'r')
temp_tr_data = np.asarray(np.copy(A01T['x']),dtype=np.float32)
tr_data = torch.from_numpy(temp_tr_data) 
tr_data.shape

file_path2 = 'tr_label.mat'  # binary labels
A02T = h5py.File(file_path2,'r')
tr_labels= np.copy(A02T['y'])
print(tr_labels)
print(tr_labels.shape)
print(tr_labels.dtype)
tr_labels = tr_labels[0,0:tr_data.shape[0]:1]
print(tr_labels.shape)
print(tr_labels.dtype)
tr_labels = np.asarray(tr_labels, dtype=np.float32)
print(tr_labels.shape)
print(tr_labels.dtype)
tr_labels= torch.from_numpy(tr_labels) 
n_epochs = 500
batch_size = 64
learning_rate = 0.01
feature = 200
max_length = 20
hidden_size = 10
n_layers = 2

#h0 = Variable(torch.randn(n_layers, batch_size, hidden_size))#h1

#Training LSTMs
model = LSTM(feature, hidden_size, batch_size=batch_size, num_layers=n_layers)

loss_fn = torch.nn.MSELoss(size_average=False)
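# (size_average=False is what triggers the deprecation warning visible in the output below;
#  newer PyTorch releases spell the same thing as torch.nn.MSELoss(reduction='sum').)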

optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate)

#####################
# Train model
#####################
hist = np.zeros(n_epochs)


for epoch in range(n_epochs):
    model.zero_grad()
    # tr_data is a torch Variable
    permutation = torch.randperm(tr_data.size()[0])
    

    for i in range(0,tr_data.size()[0], batch_size):
        optimiser.zero_grad()
        indices = permutation[i:i+batch_size]
        batch_x, batch_y = tr_data[indices], tr_labels[indices]

        # in case you wanted a semi-full example
        y_pred = model.forward(batch_x)
        loss = loss_fn(y_pred, batch_y)

        if epoch % 100 == 0:
          print("Epoch ", epoch, "MSE: ", loss.item())
        hist[epoch] = loss.item()

        # Zero out gradients, else they would accumulate across batches
        optimiser.zero_grad()

        loss.backward(retain_graph=True)  # backward pass
        optimiser.step()                  # update parameters

 

Epoch 0 MSE: 24.005014419555664
Epoch 0 MSE: 20.296175003051758
Epoch 0 MSE: 14.667539596557617
/usr/local/lib/python3.6/dist-packages/torch/nn/_reduction.py:43: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.
  warnings.warn(warning.format(ret))
Epoch 0 MSE: 11.551937103271484
Epoch 0 MSE: 9.712754249572754
Epoch 0 MSE: 7.439380645751953
Epoch 0 MSE: 7.079190254211426
Epoch 0 MSE: 13.051213264465332
Epoch 0 MSE: 9.997764587402344
Epoch 0 MSE: 11.738643646240234
Epoch 0 MSE: 12.547919273376465
Epoch 0 MSE: 6.7377543449401855
Epoch 0 MSE: 6.711551666259766
Epoch 0 MSE: 8.105769157409668
Epoch 0 MSE: 6.859475135803223
Epoch 0 MSE: 7.038944244384766
Epoch 0 MSE: 10.451923370361328
Epoch 0 MSE: 9.070074081420898
Epoch 0 MSE: 7.982135772705078
Epoch 0 MSE: 6.470491409301758
Epoch 0 MSE: 6.424145221710205
Epoch 0 MSE: 10.381178855895996
Epoch 0 MSE: 6.682828426361084
Epoch 0 MSE: 7.862491130828857
Epoch 0 MSE: 6.042342185974121
Epoch 0 MSE: 8.889896392822266
Epoch 0 MSE: 7.864719867706299
Epoch 0 MSE: 8.209149360656738
Epoch 0 MSE: 8.037969589233398
Epoch 0 MSE: 4.490922927856445
Epoch 0 MSE: 7.113046169281006
Epoch 0 MSE: 6.170615196228027
Epoch 0 MSE: 4.747644424438477

RuntimeError                              Traceback (most recent call last)
<ipython-input-…> in <module>()
     24 
     25         # in case you wanted a semi-full example
---> 26         y_pred = model.forward(batch_x)
     27         loss = loss_fn(y_pred, batch_y)
     28 

6 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/rnn.py in apply_permutation(tensor, permutation, dim)
     19 def apply_permutation(tensor, permutation, dim=1):
     20     # type: (Tensor, Tensor, int) -> Tensor
---> 21     return tensor.index_select(dim, permutation)
     22 
     23 

RuntimeError: invalid argument 3: out of range at /pytorch/aten/src/TH/generic/THTensor.cpp:318