Error
Traceback (most recent call last):
  File "/home/sumeetbatra/PycharmProjects/RL Algorithms/PPO.py", line 142, in <module>
    train(env, device)
  File "/home/sumeetbatra/PycharmProjects/RL Algorithms/PPO.py", line 127, in train
    loss.mean().backward()
  File "/home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/tensor.py", line 198, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/autograd/__init__.py", line 100, in backward
    allow_unreachable=True) # allow_unreachable flag
RuntimeError: Function MulBackward0 returned an invalid gradient at index 1 - expected type TensorOptions(dtype=float, device=cuda:0, layout=Strided, requires_grad=false) but got TensorOptions(dtype=float, device=cpu, layout=Strided, requires_grad=false) (validate_outputs at /pytorch/torch/csrc/autograd/engine.cpp:484)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x46 (0x7efe02467536 in /home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x2d83d24 (0x7efdcf3bad24 in /home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #2: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0x548 (0x7efdcf3bc858 in /home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #3: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x3d2 (0x7efdcf3be7e2 in /home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #4: torch::autograd::Engine::thread_init(int) + 0x39 (0x7efdcf3b6e59 in /home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #5: torch::autograd::python::PythonEngine::thread_init(int) + 0x38 (0x7efdfade9968 in /home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #6: <unknown function> + 0xd6cb4 (0x7efddb7a8cb4 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #7: <unknown function> + 0x9609 (0x7efe07eff609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #8: clone + 0x43 (0x7efe07ccc293 in /lib/x86_64-linux-gnu/libc.so.6)
Process finished with exit code 1
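If I'm reading the error right, the gradient for the second operand (index 1) of some multiplication comes back on the CPU while that operand lives on cuda:0, so a multiply that mixes a CPU tensor with a CUDA tensor is apparently getting through the forward pass and only failing in backward(). A minimal, hypothetical sketch of the kind of pattern I'm trying to rule out (the names here are made up, not from my code):

import torch

device = torch.device('cuda:0')

# hypothetical: a 0-dim parameter living on the GPU multiplied by a tensor
# that was created on the CPU by default
scale = torch.nn.Parameter(torch.tensor(0.0, device=device))  # cuda:0
eye = torch.eye(3)                                            # cpu

out = (eye * torch.exp(scale)).to(device)
# depending on the PyTorch version, the multiply either errors out right away
# or succeeds, with the device mismatch only surfacing in backward()
out.sum().backward()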
Code for Loss Function
optimizer.zero_grad()
rewards = torch.stack(rewards).squeeze()
vals = torch.stack(vals).squeeze()
observs = torch.stack(observs).squeeze()
acts = torch.stack(acts).squeeze()
log_probs = torch.stack(log_probs).squeeze()
adv = advantage(vals, rewards, gamma)
# old_acts = prev_model.get_action(observs)
old_logprobs = prev_model.get_policy(observs).log_prob(acts)
ratio = log_probs / old_logprobs
# surrogate losses
surr1 = ratio * adv
surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * adv
value_loss = value_loss_coef * (rewards - vals).pow(2)
entropy_rew = entropy_coef * model.get_policy(observs).entropy()
loss = -(torch.min(surr1, surr2) - value_loss + entropy_rew)
# update new and old policies
prev_model = type(model)(obs_dim, n_acts).to(device).eval()
prev_model.load_state_dict(model.state_dict())
loss.mean().backward()
optimizer.step()
Advantage Calculation
def advantage(values, rewards, gamma):
    values = values.detach()
    adv = torch.zeros((ROLLOUT, NUM_WORKERS)).to(device)
    Q = rewards[-1]
    adv[-1] = Q  # at t = T, the values will cancel each other out!
    for t in reversed(range(ROLLOUT-1)):
        Q = rewards[t] + gamma * Q
        adv[t] = Q - values[t]
    return adv
Model Instantiation
obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.shape[0]
model = ActorCritic(obs_dim, n_acts).to(device)
# save the previous policy to be used with the PPO update step
prev_model = type(model)(obs_dim, n_acts).to(device).eval()
prev_model.load_state_dict(model.state_dict())
ep_rewards = [0] * NUM_WORKERS
optimizer = optim.Adam(model.parameters(), lr=1e-5)
Model Code
class ActorCritic(nn.Module):
    # actor-critic network for continuous action space
    def __init__(self, n_obs, n_acts):
        super(ActorCritic, self).__init__()
        self.shared_params = nn.Sequential(
            nn.Linear(n_obs, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
        )
        self.log_stds = nn.Parameter(torch.tensor(0.0), requires_grad=True)
        self.policy = nn.Linear(32, n_acts)
        self.value = nn.Linear(32, 1)
        self.n_acts = n_acts

    def forward(self, input):
        x = self.shared_params(input)
        policy = F.softmax(self.policy(x), dim=1)
        value = self.value(x)
        return policy, value

    def get_action(self, obs):
        logits, _ = self.forward(obs)
        cov_matrix = (torch.eye(self.n_acts) * torch.exp(self.log_stds)).to(device)
        action_dist = MultivariateNormal(logits, cov_matrix)
        return action_dist.sample()

    def get_policy(self, obs):
        logits, _ = self.forward(obs)
        cov_matrix = (torch.eye(self.n_acts) * torch.exp(self.log_stds)).to(device)
        return MultivariateNormal(logits, cov_matrix)
I can’t seem to find which tensor is still on the CPU. Any ideas?
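Would a device audit along these lines be the right way to track it down? (Just a rough sketch; it assumes model, prev_model, and the stacked rollout tensors are in scope inside train().)

def print_devices(model, prev_model, **tensors):
    # where every parameter of the two networks lives
    for name, p in model.named_parameters():
        print('model.' + name, p.device)
    for name, p in prev_model.named_parameters():
        print('prev_model.' + name, p.device)
    # where the intermediate tensors passed in by keyword live
    for name, t in tensors.items():
        print(name, t.device)

# e.g. right before loss.mean().backward():
# print_devices(model, prev_model, rewards=rewards, vals=vals, observs=observs,
#               acts=acts, log_probs=log_probs, adv=adv, ratio=ratio, loss=loss)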