This is the error I have encountered:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [20, 80]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
The functionality works with linear layers (the basic PPO implementation is adapted from https://github.com/higgsfield/RL-Adventure-2). However, I have been trying to add some LSTM layers, which required me to reshape the inputs into sequences and make a few other changes. I'm not sure where the in-place operations that break the gradient computation are occurring.
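From what I understand, the message means that a tensor autograd saved during the forward pass was modified in place (its version counter was bumped) before backward() got to use it. Below is a small standalone snippet, with made-up shapes and unrelated to my actual networks, that reproduces the same class of error by letting an optimizer step modify a weight in place between two backward() calls that share one graph; I am not claiming this is exactly what happens in my code, just the pattern I have been looking for:

import torch
import torch.nn as nn

# Hypothetical illustration only: two losses share one graph, and optimizer.step()
# updates the weight in place between the two backward() calls, so the second
# backward() finds the saved tensor at a newer version than it expects.
lin = nn.Linear(80, 20)
opt = torch.optim.SGD(lin.parameters(), lr=0.1)

x = torch.randn(4, 80, requires_grad=True)
out = lin(x)
loss_a = out.mean()
loss_b = out.pow(2).mean()

loss_a.backward(retain_graph=True)
opt.step()           # in-place update of lin.weight bumps its version counter
loss_b.backward()    # RuntimeError: ... modified by an inplace operation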
def add_states(self, actor_, envs, state, idx, entropy):
    '''
    add states to the batch of (state, action, reward, ...) pairs
    input(s): actor_ - actor network, envs - environment, state - tensor with the current state,
              idx - current index in the data set, entropy - running entropy total
    '''
    state = torch.FloatTensor(state)  # .to(device)
    if self.type_ == 'LSTM':
        ### create batch dimension
        state = state.unsqueeze(1)
    dist = actor_(state)
    value = self.critic_(state)
    action = dist.sample()
    if self.discrete:
        temp = action.numpy().item()
    else:
        temp = F.softmax(action, dim=1)
        temp = temp.numpy()[0]
    if self.type_ == 'LSTM':
        state = state.squeeze(1)[-1:, :]
    reward, next_state = envs.step(temp, state.numpy(), idx)
    log_prob = dist.log_prob(action)
    entropy += dist.entropy().mean()
    return log_prob.unsqueeze(1), value, torch.FloatTensor(np.array([reward])).unsqueeze(1), state, action.unsqueeze(1), next_state, entropy
def train(self, multiple_runs):
    self.rewards_all = []
    update_counter = 0
    for i in range(multiple_runs):
        if i == 0:
            state, j = self.envs.initial_state()
        else:
            state, j = self.envs.initial_state(random=True)
        max_idx = self.envs.data_df.shape[0] - j
        idx = j + 1
        while idx < max_idx:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            entropy = 0
            # self.actor_.eval()
            # self.critic_.eval()
            # with torch.no_grad():
            if self.type_ == 'LSTM':
                ### save the hidden/cell states, then keep only the last batch element for the single-sequence rollout
                saved_cell_actor = (self.actor_.hidden_cell[0].clone(), self.actor_.hidden_cell[1].clone())
                saved_cell_critic = (self.critic_.hidden_cell[0].clone(), self.critic_.hidden_cell[1].clone())
                self.actor_.hidden_cell = (self.actor_.hidden_cell[0][:, -1:, :].clone(), self.actor_.hidden_cell[1][:, -1:, :].clone())
                self.critic_.hidden_cell = (self.critic_.hidden_cell[0][:, -1:, :].clone(), self.critic_.hidden_cell[1][:, -1:, :].clone())
                for k in range(self.seq_len - 1):
                    states.append(torch.FloatTensor(state[k:k+1, :]))
            ### roll out num_steps transitions
            for _ in range(self.num_steps):
                if self.type_ == 'LSTM':
                    temp_state = state[1:, :]
                log_prob, value, reward, state, action, next_state, entropy = self.add_states(self.actor_, self.envs, state, idx, entropy)
                log_probs.append(log_prob)
                values.append(value)
                rewards.append(reward)  # .to(device))
                states.append(state)
                actions.append(action)
                state = next_state
                if self.type_ == 'LSTM':
                    state = np.concatenate((temp_state, state), axis=0)
                self.rewards_all.append(reward)
                idx += 1
                if idx >= max_idx:
                    break
            ### bootstrap the value of the last state and compute GAE returns
            next_state = torch.FloatTensor(next_state)
            if self.type_ == 'LSTM':
                next_state = torch.FloatTensor(np.concatenate((state[1:, :], next_state), axis=0)).unsqueeze(1)  # .to(device)
            next_value = self.critic_(next_state)
            next_value = next_value.item()
            returns = self.sf.compute_gae(next_value, rewards, values)
            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values
            if self.type_ == 'LSTM':
                ### restore the saved hidden/cell states before the PPO update
                self.actor_.hidden_cell = saved_cell_actor
                self.critic_.hidden_cell = saved_cell_critic
            # self.actor_.train()
            # self.critic_.train()
            update_counter += 1
            if update_counter % 3 == 0:
                print('{} % done.'.format((idx / max_idx) * 100))
            self.sf.ppo_update(states, actions, log_probs, returns, advantage, self.actor_, self.critic_, self.optimizer_actor, self.optimizer_critic)
def ppo_update(self, states, actions, log_probs, returns, advantages, actor_, critic_, optimizer_actor, optimizer_critic, clip_param=0.2):
    '''
    training loop: iterate through minibatches of states, actions, returns and advantages
    '''
    for _ in range(self.ppo_epochs):
        for state, action, old_log_probs, return_, advantage in self.ppo_iter(states, actions, log_probs, returns, advantages):
            # with torch.autograd.set_detect_anomaly(True):
            ### obtain outputs from the neural nets
            dist = actor_(state)
            value = critic_(state)
            ### calculate entropy and log_prob
            entropy = dist.entropy().mean()
            new_log_probs = dist.log_prob(action)
            # print(action.size())
            # print(new_log_probs.size())
            # print(advantage.size())
            ### calculate the PPO ratio from the paper
            ratio = (new_log_probs - old_log_probs).exp()
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
            ### actor and critic losses
            actor_loss = -torch.min(surr1, surr2).mean() - 0.001 * entropy
            mse_loss = nn.MSELoss()
            critic_loss = mse_loss(value, return_) - 0.001 * entropy
            print(actor_loss.grad_fn)
            # actor_loss.register_hook(lambda grad: print(grad))
            ### update
            optimizer_actor.zero_grad()
            optimizer_critic.zero_grad()
            actor_loss.backward(retain_graph=True)
            critic_loss.backward(retain_graph=True)
            optimizer_actor.step()
            optimizer_critic.step()
def f(self, temp_tensor):
    '''
    helper function for building out the sequences
    '''
    temp = [temp_tensor[j:j+self.seq_len, :].clone().unsqueeze(1) for j in range(self.mini_batch_size)]
    # for i in temp:
    #     print(i.size())
    return torch.cat(temp, 1)
def ppo_iter(self, states, actions, log_probs, returns, advantage):
    '''
    iterate through a set of the data using minibatches
    input(s): states, actions, log_probs, returns, advantage
    '''
    batch_size = states.size(0)
    for _ in range(batch_size // self.mini_batch_size):
        # rand_ids = np.random.randint(0, batch_size, mini_batch_size)
        # lump = self.mini_batch_size + self.seq_len
        rand_ids = np.random.randint(self.seq_len - 1, batch_size - self.mini_batch_size)
        # yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]
        yield self.f(states[rand_ids-self.seq_len+1:rand_ids+self.mini_batch_size, :]), actions[rand_ids:rand_ids+self.mini_batch_size, :], log_probs[rand_ids:rand_ids+self.mini_batch_size, :], returns[rand_ids:rand_ids+self.mini_batch_size, :], advantage[rand_ids:rand_ids+self.mini_batch_size, :]
def set_up_layers(self, state_dim, action_dim):
    '''
    make the structure for the actor network
    '''
    if self.type_ == 'linear':
        layers = base_net(state_dim, self.dim_1, self.dim_2, self.type_, self.dropout)
        layers.append(nn.Linear(self.dim_2, action_dim))
        self.model = nn.Sequential(*layers)
    elif self.type_ == 'LSTM':
        self.lstm = nn.LSTM(state_dim, hidden_size=self.hidden_dim)
        self.lin_cap = nn.Linear(self.hidden_dim, action_dim)

def forward(self, state):
    '''
    forward propagate the network
    input(s): state
    output(s): result of the actor network
    '''
    out = self.model(state)
    if self.discrete:
        probs = F.softmax(out, dim=1)
        return Categorical(probs)
    else:
        mu = self.model(state)
        std = self.log_std.exp().expand_as(mu)
        dist = Normal(mu, std)
        return dist
class actor_LSTM(actor):
    '''
    LSTM actor class
    need to overwrite the forward function
    '''
    def __init__(self, state_dim, action_dim, params=default_params):
        super(actor_LSTM, self).__init__(state_dim, action_dim, params=params)
        self.reset_hidden_cell()

    def reset_hidden_cell(self):
        '''
        resets the hidden cell state, or initializes it
        '''
        self.hidden_cell = (torch.zeros(1, self.mini_batch_size, self.hidden_dim),
                            torch.zeros(1, self.mini_batch_size, self.hidden_dim))

    def forward(self, x):
        lstm_out, self.hidden_cell = self.lstm(x, self.hidden_cell)
        mu = self.lin_cap(lstm_out[-1])
        if self.discrete:
            probs = F.softmax(mu, dim=1)
            return Categorical(probs)
        else:
            std = self.log_std.exp().expand_as(mu)
            dist = Normal(mu, std)
            return dist
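For completeness, this is the kind of wrapper I had in mind with the commented-out set_detect_anomaly line in ppo_update; compute_losses is only a placeholder for the loss computation shown above, not a real function in my code:

# Illustrative sketch only: anomaly detection makes the traceback point at the
# forward operation that produced the tensor later modified in place.
with torch.autograd.set_detect_anomaly(True):
    dist = actor_(state)
    value = critic_(state)
    actor_loss, critic_loss = compute_losses(dist, value, action, old_log_probs, return_, advantage)  # placeholder
    optimizer_actor.zero_grad()
    optimizer_critic.zero_grad()
    actor_loss.backward(retain_graph=True)
    critic_loss.backward(retain_graph=True)
    optimizer_actor.step()
    optimizer_critic.step()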
Thanks in advance