Dear all,

I did some research but I could not find anything relevant.

I have an LSTM model and I would like it to run entirely on the GPU to speed up training.

Everything works, but it is slow because the system is only partially utilizing the GPU. I trained the same dataset with a similar LSTM model in Keras and it is way faster. Do you have any idea what is wrong with my code?

I have the latest version of PyTorch, and I am using Ubuntu 20.04 with an NVIDIA GTX 1070 and CUDA 11.2.
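
In case it is relevant, this is roughly how I checked that PyTorch can actually see the GPU (just a minimal sanity check, nothing more):

```
import torch

# Quick check that PyTorch was built with CUDA support and can see the GPU.
print(torch.__version__)                   # installed PyTorch version
print(torch.version.cuda)                  # CUDA version PyTorch was built against
print(torch.cuda.is_available())           # should print True
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))   # should show the GTX 1070
```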

This is my code:

```
import time

import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import mean_squared_error


class LSTM_CUDA(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTM_CUDA, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # With batch_first=True, x is (batch, seq_len, input_dim), so the
        # initial states need the batch size x.size(0), not x.size(1).
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).cuda()
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).cuda()
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        # Take the last time step of every sequence in the batch.
        out = self.fc(out[:, -1, :])
        return out


model_cuda = LSTM_CUDA(input_dim=input_dim, hidden_dim=hidden_dim,
                       output_dim=output_dim, num_layers=num_layers)
model_cuda = model_cuda.cuda()

criterion = torch.nn.CrossEntropyLoss()
criterion = criterion.cuda()
optimiser = torch.optim.Adam(model_cuda.parameters(), lr=0.01)


def validation_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for batch_idx, (x, y) in enumerate(valid_dl):
        x = x.cuda()
        y = y.long().cuda()
        y_hat = model(x)
        loss = criterion(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        if torch.cuda.is_available():
            correct += (pred.cpu() == y.cpu()).sum()
        else:
            correct += (pred == y).sum()
        # correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item() * y.shape[0]
        if torch.cuda.is_available():
            y = y.cpu()
            pred = pred.cpu()
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1))) * y.shape[0]
    return sum_loss / total, correct / total, sum_rmse / total


start_time = time.time()
hist = np.zeros(10)
sum_loss = 0.0
total = 0
for t in range(num_epochs):
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        model_cuda.train()
        optimiser.zero_grad()
        x = inputs.cuda()
        y = targets.long().cuda()
        y_train_pred = model_cuda(x)
        loss = criterion(y_train_pred, y)
        sum_loss += loss.item() * y.shape[0]
        total += y.shape[0]
        loss.backward()
        optimiser.step()
    print("Epoch ", t, "Loss train: ", loss.item())
    hist[t] = loss.item()
    val_loss, val_acc, val_rmse = validation_metrics(model_cuda, val_tensor)
    print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f"
          % (sum_loss / total, val_loss, val_acc, val_rmse))
print("--- %s seconds ---" % (time.time() - start_time))
```
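
The data loading is not shown above. For context, it looks roughly like the sketch below; the tensors, sizes, and batch size here are random placeholders (not my actual data), just to show the shapes I feed in: inputs of shape (batch, seq_len, input_dim) and integer class labels as targets.

```
# Rough sketch of the data pipeline with placeholder shapes and values.
import torch
from torch.utils.data import TensorDataset, DataLoader

num_train, num_val = 1000, 200
seq_len, input_dim, hidden_dim, num_layers, output_dim, num_epochs = 50, 8, 32, 2, 3, 10

x_train = torch.randn(num_train, seq_len, input_dim)   # (batch, seq_len, features)
y_train = torch.randint(0, output_dim, (num_train,))   # integer class labels
x_val = torch.randn(num_val, seq_len, input_dim)
y_val = torch.randint(0, output_dim, (num_val,))

train_loader = DataLoader(TensorDataset(x_train, y_train), batch_size=64, shuffle=True)
val_tensor = DataLoader(TensorDataset(x_val, y_val), batch_size=64, shuffle=False)
```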

Thanks.