Error when training an LSTM model for time-serie

errezeta · October 16, 2021, 12:51pm

Hi,
I’m trying to train an LSTM model for a time series problem, and I’m getting an error on the backward step. I’m using a custom dataset to load the data from a *.csv file and a dataloader.
This is my code:
LSTM Model:

class LSTM_RUL_Estimator(nn.Module):

    def __init__(self, n_features, hidden_dim, seq_length, num_layers=2, output_dim=1):
        super(LSTM_RUL_Estimator, self).__init__()
        self.hidden_dim = hidden_dim
        self.seq_length = seq_length
        self.num_layers = num_layers

        # Define the LSTM layers
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            batch_first=True,
            dropout=0.2
        )
        self.linear = nn.Linear(in_features=self.hidden_dim, out_features=output_dim)

    def forward(self, input):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.num_layers, input.size(0), self.hidden_dim).requires_grad_().to(device)
        # Initialize cell state
        c0 = torch.zeros(self.num_layers, input.size(0), self.hidden_dim).requires_grad_().to(device)

        lstm_out, (hn, cn) = self.lstm(input.float(), (h0.detach(), c0.detach()))
        pred = self.linear(lstm_out[:, -1, :])
        return pred

this is my “train” function:

# Training Function
def train_model(model, loss_function, optimizer, num_epochs=25):
    since = time.time()

    train_loss = []
    validation_loss = []

    for epoch in range(1, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('-' * 10)

        # Each epoch has a training phase and a validation phase at every 10 epochs
        for phase in ['train', 'test']:
            # Set model to training or evaluation mode
            model.train() if phase == 'train' else model.eval()

            # Iterate over data.
            for idx, (inputs, labels) in tqdm(enumerate(dataloaders[phase]),
                                              leave=True,
                                              total=len(dataloaders[phase])):
                inputs = inputs.to(device)
                labels = labels.to(device)
                labels = labels.float()

                # Pytorch accumulates gradients, we need to clear them out before each instance.
                model.zero_grad()

                if phase == 'train':
                    outputs = model(inputs)
                    loss = loss_function(outputs, labels)
                    train_loss.append(loss.item())
                    if epoch % 10 == 0:
                        print(f'Epoch {epoch} train loss: {loss.item()}')

                if phase == 'test':
                    with torch.no_grad():
                        outputs_val = model(inputs)
                        loss_val = loss_function(outputs_val, labels)
                    validation_loss.append(loss_val.item())
                    if epoch % 10 == 0:
                        print(f'Epoch {epoch} train loss: {loss.item()} val loss: {loss_val.item()}')

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
    return model, train_loss, validation_loss

when I call the train function I get the error:

loss.backward()
  File "C:\Miniconda3\envs\PyTorch\lib\site-packages\torch\_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "C:\Miniconda3\envs\PyTorch\lib\site-packages\torch\autograd\__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: Trying to backward through the graph a second time (or directly access saved variables after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved variables after calling backward.

and if I use: loss.backward(retain_graph=True) then I get the error:

loss.backward(retain_graph=True)
  File "C:\Miniconda3\envs\PyTorch\lib\site-packages\torch\_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "C:\Miniconda3\envs\PyTorch\lib\site-packages\torch\autograd\__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [100, 1]], which is output 0 of TBackward, is at version 646; expected version 645 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

any help would be appreciated.