Hello,

I have created the following model:

```
def __init__(self, input_size, output_size, hidden_dim, n_layers, n_feats, drop_prob=0.5):
    """Create the layers of the model: a stacked GRU, a layer norm and a two-layer head.

    Args:
        input_size: Number of input features the GRU receives per step.
        output_size: Number of outputs of the last linear layer
            (the original note describes it as the alphabet plus
            extra symbols such as the space character).
        hidden_dim: Size of the GRU hidden state.
        n_layers: Number of stacked GRU layers.
        n_feats: Size of the trailing dimension normalised by ``layer_norm``.
        drop_prob: Dropout probability applied between the stacked GRU layers.
    """
    super(MySpeechRecognition, self).__init__()

    # Remember the configuration on the instance.
    self.input_size = input_size
    self.hidden_dim = hidden_dim
    self.n_layers = n_layers
    self.drop_prob = drop_prob
    self.output_dim = output_size

    # Recurrent part. batch_first=True means the GRU consumes and produces
    # tensors laid out as (batch, seq, feature).
    self.gru = nn.GRU(
        input_size=input_size,
        hidden_size=hidden_dim,
        num_layers=n_layers,
        batch_first=True,
        dropout=drop_prob,
    )

    # Normalisation over a trailing dimension of size n_feats.
    self.layer_norm = nn.LayerNorm(normalized_shape=n_feats)

    # Classifier head: hidden_dim -> 512 -> output_size.
    self.fc1 = nn.Linear(in_features=hidden_dim, out_features=512)
    self.fc2 = nn.Linear(in_features=512, out_features=output_size)

    # NOTE: this dropout rate is fixed at 0.2 and does not follow drop_prob.
    self.dropout = nn.Dropout(p=0.2)
```

```
# Hyper-parameters used to build and train the model.

# Feature sizes: both the model input size and the normalised feature size are 128.
input_size = n_feats = 128

# Number of output classes.
n_classes = 29

# Recurrent stack: hidden state width and number of stacked layers.
hidden_dim, n_layers = 250, 2

# Samples per mini-batch.
batch_size = 32
```

And this is my training loop:

```
def _validation_loss(model, valid_loader, criterion):
    """Return the mean per-sample loss of ``model`` over the full batches of ``valid_loader``.

    The model is switched to eval mode for the pass and always put back into
    training mode afterwards. Validation uses its own hidden state, so the
    caller's training hidden state is never read or modified.

    Relies on the module-level names ``batch_size`` and ``train_on_gpu``.
    Returns ``0.0`` when the loader yields no full batch.
    """
    total_loss = 0.0
    n_samples = 0
    model.eval()
    try:
        val_h = model.init_hidden(batch_size)
        with torch.no_grad():
            for specs, labels, input_lengths, label_lengths in valid_loader:
                # The hidden state is sized for batch_size, so a shorter
                # batch cannot be fed to the model.
                if len(specs) != batch_size:
                    break
                if train_on_gpu:
                    specs, labels = specs.cuda(), labels.cuda()
                output, val_h = model(specs, val_h)
                output = F.log_softmax(output, dim=2).transpose(0, 1)
                batch_loss = criterion(output, labels.float(), input_lengths, label_lengths)
                total_loss += batch_loss.item() * specs.size(0)
                n_samples += specs.size(0)
    finally:
        model.train()
    return total_loss / n_samples if n_samples else 0.0


def train(n_epochs, train_loader, valid_loader, model, optimizer, criterion, clip, save_path,
          val_every=100):
    """Train ``model`` for ``n_epochs`` epochs and checkpoint it after every epoch.

    Besides its parameters, this function relies on three module-level names:
    ``batch_size`` (expected size of every batch), ``train_on_gpu`` (move the
    batches to CUDA when true) and ``scheduler`` (stepped once per batch).

    Args:
        n_epochs: Number of passes over ``train_loader``.
        train_loader: Iterable of ``(specs, labels, input_lengths,
            label_lengths)`` batches; must expose ``.dataset`` and ``len()``.
        valid_loader: Iterable of validation batches with the same layout.
        model: Module called as ``model(specs, hidden)`` returning
            ``(output, hidden)`` and providing ``init_hidden(batch_size)``.
            The output is assumed to be ``(batch, time, classes)`` since it
            is soft-maxed over dim 2 and then transposed to put time first
            (presumably for a CTC-style loss; confirm against the caller).
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss called as
            ``criterion(output, labels, input_lengths, label_lengths)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
        save_path: Destination of the ``state_dict`` checkpoint.
        val_every: Run a full validation pass every ``val_every`` training
            batches (and on the last batch of the loader). A full pass over
            ``valid_loader`` is expensive; raise this value to train faster.

    Returns:
        ``(model, loss_values, test_loss_values)``, where both lists hold one
        entry per epoch: the training loss summed over samples and divided by
        the dataset length, and the mean per-sample validation loss of the
        most recent validation pass in that epoch.

    Raises:
        ValueError: If ``val_every`` is smaller than 1.
    """
    if val_every < 1:
        raise ValueError("val_every must be a positive integer")

    data_len = len(train_loader.dataset)
    n_batches = len(train_loader)

    # The histories are created once, outside the epoch loop, so the returned
    # lists cover every epoch rather than only the last one.
    loss_values = []
    test_loss_values = []

    model.train()
    for e in range(n_epochs):
        epoch_start = time.time()
        running_loss = 0.0
        latest_val_loss = 0.0
        h = model.init_hidden(batch_size)

        for batch_idx, (specs, labels, input_lengths, label_lengths) in enumerate(train_loader):
            # The hidden state is sized for batch_size, so a shorter (final)
            # batch cannot be fed to the model; the epoch stops here.
            if len(specs) != batch_size:
                break
            if train_on_gpu:
                specs, labels = specs.cuda(), labels.cuda()

            # Cut the graph so gradients do not flow back into earlier batches.
            h = h.detach()

            optimizer.zero_grad()
            output, h = model(specs, h)
            output = F.log_softmax(output, dim=2).transpose(0, 1)
            loss = criterion(output, labels.float(), input_lengths, label_lengths)
            loss.backward()
            # Gradient-norm clipping guards against exploding gradients in RNNs.
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            scheduler.step()

            running_loss += loss.item() * specs.size(0)

            if batch_idx % val_every == 0 or batch_idx == n_batches - 1:
                latest_val_loss = _validation_loss(model, valid_loader, criterion)
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tValid Loss: {:.6f}'.format(
                    e + 1, batch_idx * len(specs), data_len,
                    100. * batch_idx / n_batches, loss.item(), latest_val_loss))

        loss_values.append(running_loss / data_len)
        test_loss_values.append(latest_val_loss)
        torch.save(model.state_dict(), save_path)
        print('Epoch {} + took {} seconds'.format(e + 1, time.time() - epoch_start))

    return (model, loss_values, test_loss_values)
```

I am not sure whether the training loop can be improved in any way. It takes too long, even for just 10 epochs…

I tried it on both CPU and GPU on Kaggle.

Thanks.