Hi,
I’m trying to train an LSTM model for a time series problem, and I’m getting an error on the backward
step. I’m using a custom dataset to load the data from a *.csv file and a dataloader.
This is my code:
LSTM Model:
class LSTM_RUL_Estimator(nn.Module):
def __init__(self, n_features, hidden_dim, seq_length, num_layers=2, output_dim=1):
super(LSTM_RUL_Estimator, self).__init__()
self.hidden_dim = hidden_dim
self.seq_length = seq_length
self.num_layers = num_layers
# Define the LSTM layers
self.lstm = nn.LSTM(
input_size=n_features,
hidden_size=self.hidden_dim,
num_layers=self.num_layers,
batch_first=True,
dropout=0.2
)
self.linear = nn.Linear(in_features=self.hidden_dim, out_features=output_dim)
def forward(self, input):
# Initialize hidden state with zeros
h0 = torch.zeros(self.num_layers, input.size(0), self.hidden_dim).requires_grad_().to(device)
# Initialize cell state
c0 = torch.zeros(self.num_layers, input.size(0), self.hidden_dim).requires_grad_().to(device)
lstm_out, (hn, cn) = self.lstm(input.float(), (h0.detach(), c0.detach()))
pred = self.linear(lstm_out[:, -1, :])
return pred
this is my “train” function:
# Training Function
def train_model(model, loss_function, optimizer, num_epochs=25):
since = time.time()
train_loss = []
validation_loss = []
for epoch in range(1, num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs))
print('-' * 10)
# Each epoch has a training phase and a validation phase at every 10 epochs
for phase in ['train', 'test']:
# Set model to training or evaluation mode
model.train() if phase == 'train' else model.eval()
# Iterate over data.
for idx, (inputs, labels) in tqdm(enumerate(dataloaders[phase]),
leave=True,
total=len(dataloaders[phase])):
inputs = inputs.to(device)
labels = labels.to(device)
labels = labels.float()
# Pytorch accumulates gradients, we need to clear them out before each instance.
model.zero_grad()
if phase == 'train':
outputs = model(inputs)
loss = loss_function(outputs, labels)
train_loss.append(loss.item())
if epoch % 10 == 0:
print(f'Epoch {epoch} train loss: {loss.item()}')
if phase == 'test':
with torch.no_grad():
outputs_val = model(inputs)
loss_val = loss_function(outputs_val, labels)
validation_loss.append(loss_val.item())
if epoch % 10 == 0:
print(f'Epoch {epoch} train loss: {loss.item()} val loss: {loss_val.item()}')
optimizer.zero_grad()
loss.backward()
optimizer.step()
return model, train_loss, validation_loss
when I call the train function I get the error:
loss.backward()
File "C:\Miniconda3\envs\PyTorch\lib\site-packages\torch\_tensor.py", line 255, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "C:\Miniconda3\envs\PyTorch\lib\site-packages\torch\autograd\__init__.py", line 147, in backward
Variable._execution_engine.run_backward(
RuntimeError: Trying to backward through the graph a second time (or directly access saved variables after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved variables after calling backward.
and if I use: loss.backward(retain_graph=True)
then I get the error:
loss.backward(retain_graph=True)
File "C:\Miniconda3\envs\PyTorch\lib\site-packages\torch\_tensor.py", line 255, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "C:\Miniconda3\envs\PyTorch\lib\site-packages\torch\autograd\__init__.py", line 147, in backward
Variable._execution_engine.run_backward(
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [100, 1]], which is output 0 of TBackward, is at version 646; expected version 645 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
any help would be appreciated.