Hi,

The training step of my LSTM network takes more than 15 minutes just for the first epoch, so I suspect I made a mistake somewhere.

```
def train_model(model, epochs=epochs_default_number, lr=lr_default_value):
    """Train ``model`` with Adam on the module-level ``train_dl`` loader.

    Only parameters with ``requires_grad`` set are handed to the optimizer.
    After every epoch the model is evaluated on ``val_dl`` through
    ``validation_metrics``; the metrics are printed on epochs whose index
    satisfies ``index % 5 == 1``.

    Args:
        model: Module called as ``model(x, l)`` that returns class scores.
        epochs: Number of passes over ``train_dl``.
        lr: Learning rate passed to ``torch.optim.Adam``.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        seen = 0

        for inputs, targets, lengths in train_dl:
            inputs = inputs.long()
            targets = targets.long()
            batch_size = targets.shape[0]

            logits = model(inputs, lengths)
            loss = F.cross_entropy(logits, targets.to(device))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight the batch loss by its size so the epoch average is per sample.
            running_loss += loss.item() * batch_size
            seen += batch_size

        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if epoch % 5 != 1:
            continue
        print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (running_loss / seen, val_loss, val_acc, val_rmse))
```

```
def validation_metrics(model, valid_dl):
    """Evaluate ``model`` on ``valid_dl`` and return aggregate metrics.

    The model is switched to eval mode (and left in it). Forward passes run
    under ``torch.no_grad()`` because nothing here calls ``backward()``;
    recording the autograd graph would only cost time and memory.

    Args:
        model: Module called as ``model(x, l)`` that returns class scores.
        valid_dl: Iterable yielding ``(x, y, l)`` batches.

    Returns:
        A tuple ``(loss, accuracy, rmse)``:
        the cross-entropy loss averaged over samples, the fraction of
        correct argmax predictions (a tensor, since the correct count is
        accumulated as a tensor), and the per-batch RMSE between predicted
        class indices and labels, averaged with batch-size weights.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    # Evaluation only: skip building the autograd graph for every batch.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long()
            y = y.long()
            batch_size = y.shape[0]
            # Move the labels to the device once and reuse them below.
            y_device = y.to(device)
            y_hat = model(x, l)
            loss = F.cross_entropy(y_hat, y_device)
            # Index of the highest score along dim 1 is the predicted class.
            pred = torch.max(y_hat, 1)[1]
            correct += (pred == y_device).float().sum()
            total += batch_size
            sum_loss += loss.item() * batch_size
            sum_rmse += np.sqrt(mean_squared_error(pred.cpu(), y.unsqueeze(-1))) * batch_size
    return sum_loss / total, correct / total, sum_rmse / total
```

```
class LSTM_fl(torch.nn.Module):
    """Embedding -> dropout -> LSTM -> linear classifier.

    The output size is read from the module-level ``number_of_output_classes``
    and the dropout probability from the module-level ``dropout_value``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, number_of_layers):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector (LSTM input size).
            hidden_dim: LSTM hidden size (input size of the linear layer).
            number_of_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        # Index 0 is the padding index of the embedding table.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=number_of_layers,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_dim, number_of_output_classes)
        self.dropout = nn.Dropout(dropout_value)

    def forward(self, x, l):
        """Return class scores computed from the last layer's final hidden state.

        ``l`` is accepted but not used by this model.
        """
        # The input is moved to ``device`` once; every later tensor is
        # derived from it and therefore already lives there.
        embedded = self.dropout(self.embeddings(x.to(device)))
        _, (final_hidden, _) = self.lstm(embedded)
        return self.linear(final_hidden[-1])
```

```
# Build the LSTM classifier, place it on the target device, then train it.
model_fixed = LSTM_fl(
    vocab_size,
    embedding_dim_value,
    hidden_dim_value,
    number_of_layers_value,
).to(device)
train_model(model_fixed, epochs=epochs_number, lr=lr_value)
```

These are the values I used (I experimented with lr and batch_size, but without success):

lr_value = 0.00001

lr_default_value = 0.001

epochs_number = 30

embedding_dim_value = 50

hidden_dim_value = 50

batch_size_value = 8

number_of_layers_value = 2

dropout_value = 0.2

Does anyone have advice on how to speed it up?

Thanks a lot