Problem with Backprop - Runtime Error: Expected TensorOptions(...) on cuda but got TensorOptions(...) on cpu with Function MulBackward0

Error

Traceback (most recent call last):
  File "/home/sumeetbatra/PycharmProjects/RL Algorithms/PPO.py", line 142, in <module>
    train(env, device)
  File "/home/sumeetbatra/PycharmProjects/RL Algorithms/PPO.py", line 127, in train
    loss.mean().backward()
  File "/home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/tensor.py", line 198, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/autograd/__init__.py", line 100, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: Function MulBackward0 returned an invalid gradient at index 1 - expected type TensorOptions(dtype=float, device=cuda:0, layout=Strided, requires_grad=false) but got TensorOptions(dtype=float, device=cpu, layout=Strided, requires_grad=false) (validate_outputs at /pytorch/torch/csrc/autograd/engine.cpp:484)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x46 (0x7efe02467536 in /home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x2d83d24 (0x7efdcf3bad24 in /home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #2: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0x548 (0x7efdcf3bc858 in /home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #3: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x3d2 (0x7efdcf3be7e2 in /home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #4: torch::autograd::Engine::thread_init(int) + 0x39 (0x7efdcf3b6e59 in /home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #5: torch::autograd::python::PythonEngine::thread_init(int) + 0x38 (0x7efdfade9968 in /home/sumeetbatra/RL37/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #6: <unknown function> + 0xd6cb4 (0x7efddb7a8cb4 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #7: <unknown function> + 0x9609 (0x7efe07eff609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #8: clone + 0x43 (0x7efe07ccc293 in /lib/x86_64-linux-gnu/libc.so.6)

Process finished with exit code 1

Code for Loss Function

        optimizer.zero_grad()
        rewards = torch.stack(rewards).squeeze()
        vals = torch.stack(vals).squeeze()
        observs = torch.stack(observs).squeeze()
        acts = torch.stack(acts).squeeze()
        log_probs = torch.stack(log_probs).squeeze()
        adv = advantage(vals, rewards, gamma)

        # old_acts = prev_model.get_action(observs)
        old_logprobs = prev_model.get_policy(observs).log_prob(acts)
        ratio = log_probs / old_logprobs
        # surrogate losses
        surr1 = ratio * adv
        surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * adv
        value_loss = value_loss_coef * (rewards - vals).pow(2)
        entropy_rew = entropy_coef * model.get_policy(observs).entropy()
        loss = -(torch.min(surr1, surr2) - value_loss + entropy_rew)

        # update new and old policies
        prev_model = type(model)(obs_dim, n_acts).to(device).eval()
        prev_model.load_state_dict(model.state_dict())

        loss.mean().backward()
        optimizer.step()

Advantage Calculation

def advantage(values, rewards, gamma):
    values = values.detach()
    adv = torch.zeros((ROLLOUT, NUM_WORKERS)).to(device)
    Q = rewards[-1]
    adv[-1] = Q # at t = T, the values will cancel each other out!
    for t in reversed(range(ROLLOUT-1)):
        Q = rewards[t] + gamma * Q
        adv[t] = Q - values[t]
    return adv

Model Instantiation

    obs_dim = env.observation_space.shape[0]
    n_acts = env.action_space.shape[0]
    model = ActorCritic(obs_dim, n_acts).to(device)
    # save the previous policy to be used with the PPO update step
    prev_model = type(model)(obs_dim, n_acts).to(device).eval()
    prev_model.load_state_dict(model.state_dict())
    ep_rewards = [0] * NUM_WORKERS
    optimizer = optim.Adam(model.parameters(), lr=1e-5)

Model Code

class ActorCritic(nn.Module):
    # actor-critic network for continuous action space
    def __init__(self, n_obs, n_acts):
        super(ActorCritic, self).__init__()

        self.shared_params = nn.Sequential(
            nn.Linear(n_obs, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
        )
        self.log_stds = nn.Parameter(torch.tensor(0.0), requires_grad=True)
        self.policy = nn.Linear(32, n_acts)
        self.value = nn.Linear(32, 1)
        self.n_acts = n_acts

    def forward(self, input):
        x = self.shared_params(input)
        policy = F.softmax(self.policy(x), dim=1)
        value = self.value(x)
        return policy, value

    def get_action(self, obs):
        logits, _ = self.forward(obs)
        cov_matrix = (torch.eye(self.n_acts) * torch.exp(self.log_stds)).to(device)
        action_dist = MultivariateNormal(logits, cov_matrix)
        return action_dist.sample()

    def get_policy(self, obs):
        logits, _ = self.forward(obs)
        cov_matrix = (torch.eye(self.n_acts) * torch.exp(self.log_stds)).to(device)
        return MultivariateNormal(logits, cov_matrix)

I can't seem to find which tensor is on the CPU. Any ideas?
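
For reference, a quick (non-exhaustive) check could be to print the device of each tensor that goes into the loss, plus the model parameters. It won't catch intermediate tensors created inside the model's methods, but it rules out the obvious candidates:

    # rough debugging sketch -- print the device of the tensors used in the loss
    for name, t in [('rewards', rewards), ('vals', vals), ('observs', observs),
                    ('acts', acts), ('log_probs', log_probs), ('adv', adv)]:
        print(name, t.device)
    # and the model parameters
    for name, p in model.named_parameters():
        print(name, p.device)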

Could you post a minimal executable code snippet using your classes to reproduce this issue?
Also, which PyTorch version are you using at the moment? If you are not using the latest stable version (1.7.0), could you update to it?
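E.g. something along these lines, using your ActorCritic class with random tensors standing in for the gym rollout, would be enough to run (shapes and the loss are just placeholders):

    # skeleton of a minimal repro, assuming the imports and ActorCritic class
    # from the post above, a CUDA device, and random data instead of the env
    device = torch.device('cuda')
    obs_dim, n_acts, batch = 8, 2, 16

    model = ActorCritic(obs_dim, n_acts).to(device)
    obs = torch.randn(batch, obs_dim, device=device)
    adv = torch.randn(batch, device=device)

    dist = model.get_policy(obs)
    acts = dist.sample()
    loss = -(dist.log_prob(acts) * adv).mean()
    loss.backward()  # should surface the device mismatch (the exact error may differ by version)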

I was on 1.4.0. After updating to 1.7.0, I got a different error message and was able to narrow it down to one of the tensors in get_policy that was still on cpu. Thanks!
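
For anyone hitting the same thing: the tensor on the CPU was most likely the torch.eye(...) in get_action / get_policy, which is created on the CPU and multiplied with the CUDA log_stds parameter before the result is moved to the GPU. Creating it directly on the parameter's device avoids the mismatch, roughly:

    # sketch of the fix: build the covariance matrix on the same device as the
    # parameters instead of moving it after the multiplication
    def get_policy(self, obs):
        logits, _ = self.forward(obs)
        cov_matrix = torch.eye(self.n_acts, device=self.log_stds.device) * torch.exp(self.log_stds)
        return MultivariateNormal(logits, cov_matrix)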