I am trying to implement an agent using proximal policy optimization (PPO). However, when I call loss.backward() I get the following warning and error:
C:\Users\Asus\anaconda3\lib\site-packages\torch\autograd\__init__.py:251: UserWarning: Error detected in AddmmBackward0. No forward pass information available. Enable detect anomaly during forward pass for more information. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\python_anomaly_mode.cpp:97.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "C:\Users\Asus\Desktop\RL_a\game.py", line 289, in <module>
agent.update_policy(states, actions, rewards, log_probs, values, next_values, dones)
File "C:\Users\Asus\Desktop\RL_a\ppox.py", line 95, in update_policy
loss.backward(retain_graph=True)
File "C:\Users\Asus\anaconda3\lib\site-packages\torch\_tensor.py", line 492, in backward
torch.autograd.backward(
File "C:\Users\Asus\anaconda3\lib\site-packages\torch\autograd\__init__.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [128, 4096]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
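As far as I understand, this error means that a tensor which autograd saved for the backward pass was modified in place before backward() ran. For example, this small standalone snippet (just my own illustration, not from my project) raises the same kind of error, because exp() saves its output for the backward pass and add_() then modifies that saved output in place:

import torch

x = torch.randn(4, requires_grad=True)
y = torch.exp(x)      # exp saves its output tensor for the backward pass
y.add_(1.0)           # in-place op bumps y's version counter
y.sum().backward()    # RuntimeError: ... modified by an inplace operation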
My code is:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import torch.nn.functional as F
import numpy as np

# Define the neural network for the policy
class PolicyNetwork(nn.Module):
    def __init__(self, input_size, output_size):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, output_size)

    def forward(self, x):
        x = torch.relu(self.fc(x))
        x = self.fc2(x)
        return torch.softmax(x, dim=-1)

# Define the Proximal Policy Optimization agent
class PPOAgent:
    def __init__(self, input_size, output_size, lr=1e-3, gamma=0.99, epsilon=0.2, value_coef=0.5, entropy_coef=0.01):
        self.policy = PolicyNetwork(input_size, output_size)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma
        self.epsilon = epsilon
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef

    def select_action(self, state):
        #print("state: ", state)
        xstate = torch.from_numpy(state).float()
        probs = self.policy(xstate)
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

    def update_policy(self, states, actions, rewards, log_probs, values, next_values, dones):
        torch.autograd.set_detect_anomaly(True)
        returns = self.compute_returns(rewards, dones)
        """
        print("values", values, "next_values", next_values)
        print("size of values", len(values[0]), "size of next_values", len(next_values))
        print("type:", type(values[0]), type(values))
        print("returns", returns)
        print("size of returns", len(returns))
        print("type:", type(returns))
        """
        # convert values to tensor
        xvalues = torch.tensor(values, requires_grad=True).float()
        advantages = returns - xvalues
        print("advantages: ", advantages)
        for _ in range(ppo_epochs):
            for i in range(len(states)):
                state = torch.from_numpy(states[i]).float()
                action = torch.tensor(actions[i])
                old_log_prob = log_probs[i]
                value = xvalues[i]
                next_value = next_values[i]
                advantage = advantages[i]
                return_ = returns[i]

                # Compute the new log probability and value
                new_probs = self.policy(state)
                new_log_prob = torch.log(new_probs[action].clone())
                new_value = self.get_value(states[i])

                # Compute the surrogate loss
                ratio = torch.exp(new_log_prob - old_log_prob)
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon) * advantage
                policy_loss = -torch.min(surr1, surr2).mean()

                # Compute the value loss
                value_loss = F.mse_loss(new_value, return_)
                #print("lennnn: ",len(new_value), len(return_), len(advantage), len(ratio), len(surr1), len(surr2), len(policy_loss), len(value_loss), len(new_probs), len(new_log_prob), len(old_log_prob), len(advantage), len(advantages), len(returns), len(xvalues), len(value), len(next_value), len(action), len(state), len(states[i]), len(states), len(actions), len(log_probs), len(values), len(next_values), len(dones))

                # Compute the entropy loss
                entropy_loss = -torch.sum(new_probs * torch.log(new_probs + 1e-10))

                # Total loss
                loss = policy_loss + self.value_coef * value_loss - self.entropy_coef * entropy_loss

                # Optimize the policy
                self.optimizer.zero_grad()
                print("loss: ", loss)
                with torch.autograd.detect_anomaly():
                    loss.backward(retain_graph=True)
                self.optimizer.step()

    def compute_returns(self, rewards, dones):
        returns = []
        R = 0
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                R = 0
            Rz = reward + self.gamma * R
            returns.insert(0, Rz)
        returnsx = torch.tensor(returns).float()
        returnsy = (returnsx - returnsx.mean()) / (returnsx.std() + 1e-8)
        return returnsy

    def get_value(self, state):
        print("ggstate: ", state)
        statex = torch.from_numpy(state).float()
        return self.policy(statex)

# Set your environment parameters
input_size = 64        # Assuming a flat representation of the chess board as input
output_size = 64 * 64  # Number of legal moves in your chess environment

# Initialize the PPO agent
agent = PPOAgent(input_size, output_size)

# Training loop
num_episodes = 100
ppo_epochs = 4
"""
for episode in range(num_episodes):
    state = 0
    done = False
    states, actions, rewards, log_probs, values, next_values, dones = [], [], [], [], [], [], []
    while not done:
        action, log_prob = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        log_probs.append(log_prob)
        values.append(agent.get_value(state))
        next_values.append(agent.get_value(next_state))
        dones.append(done)
        state = next_state
    agent.update_policy(states, actions, rewards, log_probs, values, next_values, dones)
"""
I tried to remove all in-place operations, but I still get the same error.
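For example, I looked for patterns like x += y, x[i] = ..., and tensor.add_(...) and rewrote them as out-of-place versions, roughly like this sketch (illustrative only, not my actual code):

import torch

t = torch.ones(3, requires_grad=True)
h = torch.relu(t)     # relu saves its output for the backward pass
# before: h += 1      # in-place update of a tensor that autograd saved
h = h + 1             # out-of-place: builds a new tensor instead
h.sum().backward()    # backward succeeds here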