Could you please check this code for me? The model is not training.

#%% LSTM architecture
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last timestep.

    Args:
        input_dim: Number of input features per timestep.
        hidden_dim: Size of the LSTM hidden state.
        batch_size: Number of sequences per batch; used to reshape inputs
            and to size the initial hidden state.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of outputs produced by the linear layer.
    """

    def __init__(self, input_dim, hidden_dim, batch_size, num_layers, output_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers

        # Define the LSTM layer
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)

        # Define the output layer
        self.linear = nn.Linear(self.hidden_dim, output_dim)

    def init_hidden(self):
        """Return a zero-filled ``(h_0, c_0)`` pair.

        Each tensor has shape ``(num_layers, batch_size, hidden_dim)`` and
        takes its dtype and device from the model's own parameters, so the
        state stays compatible after ``net.double()`` or ``net.to(device)``.
        """
        reference = self.linear.weight
        shape = (self.num_layers, self.batch_size, self.hidden_dim)
        return (
            torch.zeros(shape, dtype=reference.dtype, device=reference.device),
            torch.zeros(shape, dtype=reference.dtype, device=reference.device),
        )

    def forward(self, input):
        """Run the LSTM over ``input`` and predict from its last step.

        Args:
            input: Tensor reshaped to ``(len(input), batch_size, -1)``
                before being fed to the LSTM.
                NOTE(review): this treats dim 0 as the time axis; confirm
                the data loader does not yield batch-first tensors.

        Returns:
            The linear layer's output for the last index along dim 0 of the
            LSTM output, flattened to one dimension.
        """
        # No initial state is passed, so nn.LSTM starts from zeros; the
        # final (h_n, c_n) is kept on ``self.hidden``.
        lstm_out, self.hidden = self.lstm(
            input.view(len(input), self.batch_size, -1)
        )
        # Only take the output from the final timestep
        y_pred = self.linear(lstm_out[-1].view(self.batch_size, -1))
        return y_pred.view(-1)

#%% Train the Model

# Per-epoch mean losses, one entry appended per epoch.
loss_epoch_train = []
loss_epoch_val = []
net = net.double()
# Batches are moved to whatever device the model's parameters live on.
# NOTE(review): the right-hand side of `seq, labels =` was lost in the
# original paste; `.to(device)` is the assumed intent -- confirm.
device = next(net.parameters()).device
for epoch in range(num_epochs):
    loss_seq_train = []
    loss_seq_val = []

    # train loop
    net.train()
    for seq, labels in train_loader:
        seq, labels = seq.to(device), labels.to(device)
        # init hidden cell
        net.hidden = net.init_hidden()
        # Clear stored gradient
        # NOTE(review): `optimizer` must be created before this loop (e.g. a
        # torch.optim optimizer over net.parameters()); it is not defined in
        # this snippet.
        optimizer.zero_grad()
        y_pred_train = net(seq.double())
        # loss computation and backpropagation
        seq_loss = loss_function(y_pred_train, labels)
        seq_loss.backward()
        # Without this step the weights never change, i.e. nothing trains.
        optimizer.step()
        loss_seq_train.append(seq_loss.item())
        print('Epoch: ' + str(epoch+1) + ', Loss: ' + str(seq_loss.item()))
    loss_epoch_train.append(np.mean(loss_seq_train))

    # val loop
    net.eval()
    # Gradients are not needed for validation.
    with torch.no_grad():
        for seq, labels in val_loader:
            seq, labels = seq.to(device), labels.to(device)
            # current model prediction
            y_pred_val = net(seq.double())
            # loss computation
            seq_loss = loss_function(y_pred_val, labels)
            loss_seq_val.append(seq_loss.item())
    loss_epoch_val.append(np.mean(loss_seq_val))

    # print loss of validation and training data for each epoch
    print('Epoch '+str(epoch+1)+'/'+str(num_epochs)+': Train-Loss: '+str(np.round(loss_epoch_train[-1],4))+'; Val-Loss: '+str(np.round(loss_epoch_val[-1],4)))

Hey @john, is any part of this model using DataParallel, DistributedDataParallel, or torch.distributed.rpc? Any reason for tagging this question with “distributed-rpc”?

The format of the code looks distorted, and will be hard to debug. Could you please share a properly-formatted self-contained example?

Please, can I share my complete code with you? I really want to get it done correctly.