LSTM hidden states on CPU while model is moved to GPU

Hi all, a pytorch newbie here,

I was trying to use a stacked LSTM model for time series analysis, and I wanted to batch my input. The input tensors are put into a DataLoader and moved to CUDA when I call
model(batch.to(device)), with the model moved to the GPU as well, but I still get an error telling me the input tensors are on the GPU while the hidden states are on the CPU.

input_dim = 1
hidden_dim = 64
num_layers = 3
output_dim = 1
num_epochs = 40
lr = 0.01 # 0.01

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = 'cpu'
print(f'***** The current device is {device} !!! *****')

class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, bidirectional):
        super().__init__()
        d=1
        if bidirectional==True:
          d=2
        self.d=d
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=bidirectional, dropout=0.2)
        self.fc = nn.Linear(self.d*hidden_dim, output_dim)

    def forward(self, x):
        h0 = torch.zeros(self.d*self.num_layers, x.size(0), self.hidden_dim).requires_grad_()
        c0 = torch.zeros(self.d*self.num_layers, x.size(0), self.hidden_dim).requires_grad_()
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out[:, -1, :]) 
        return out

print(y_train.shape, type(y_train))
print(y_test.shape, type(y_test))

model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim,
             num_layers=num_layers, bidirectional=True)

loss_fn = torch.nn.MSELoss(reduction='mean')
optimiser = torch.optim.Adam(model.parameters(), lr=lr)

from torch.utils.data import Dataset, DataLoader, random_split

train_dataloader = DataLoader(x_train, batch_size=32, drop_last=False)
test_dataloader = DataLoader(x_test, batch_size=32, drop_last=False)
# print(len(train_dataloader), len(test_dataloader))

board_train = np.zeros(num_epochs)
board_test = np.zeros(num_epochs)

# execute per epoch
for t in range(num_epochs):    
    y_train_pred_lst = []
    for i, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        y_train_pred_batch = model(batch.to(device))
        y_train_pred_lst.append(y_train_pred_batch)
    
    y_train_pred = torch.cat(y_train_pred_lst)
    # print('Shape for y_train_pred:', y_train_pred.shape)
    loss_train = loss_fn.to(device)(y_train_pred, y_train)
    print("Epoch ", t+1, "MSE: ", loss_train.item())
    board_train[t] = loss_train.item()
    optimiser.zero_grad()
    loss_train.backward()
    optimiser.step()
    
    y_test_pred_lst = []
    for i, batch in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        y_test_pred_batch = model(batch.to(device))
        y_test_pred_lst.append(y_test_pred_batch)
    
    y_test_pred = torch.cat(y_test_pred_lst)
    loss_test = loss_fn.to(device)(y_test_pred, y_test)
    board_test[t] = loss_test.item()
print('Training Completed !')

It seems you’ve forgotten to move the model to the GPU via model.to(device) — the snippet never calls it. That alone won’t clear the “hidden tensor at cpu” message, though: the h0 and c0 you build inside forward with torch.zeros(...) are allocated on the CPU, so they also need to live on the same device as x before being passed to self.lstm.
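
A minimal sketch of both changes, assuming your batches are float tensors of shape (batch, seq_len, input_dim) as in your current DataLoader:

# move parameters (LSTM weights, fc layer) to the GPU once, right after construction
model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim,
             num_layers=num_layers, bidirectional=True).to(device)

# inside the LSTM class, allocate the initial states on the input's device
def forward(self, x):
    h0 = torch.zeros(self.d * self.num_layers, x.size(0), self.hidden_dim,
                     device=x.device, dtype=x.dtype)
    c0 = torch.zeros(self.d * self.num_layers, x.size(0), self.hidden_dim,
                     device=x.device, dtype=x.dtype)
    out, (hn, cn) = self.lstm(x, (h0, c0))
    out = self.fc(out[:, -1, :])
    return out

The .requires_grad_() / .detach() pair can be dropped, since the zero initial states don't need gradients. Also note that y_train and y_test appear to stay on the CPU while the predictions are on the GPU, so the loss computation will likely raise a similar device mismatch; something like loss_fn(y_train_pred, y_train.to(device)) avoids it.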