Here is the code:
import torch
from tensordict import TensorDict
from tensordict.nn import TensorDictModule
from torch import nn
from torchrl.envs import Compose, ToTensorImage, TransformedEnv
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs.utils import check_env_specs
from torchrl.modules import OneHotCategorical, ProbabilisticActor, ValueOperator
from tqdm import tqdm
device = "cpu" if not torch.cuda.is_available() else "cuda:0"
num_cells = 256
lr = 3e-4
max_grad_norm = 1.0
frame_skip = 1
frames_per_batch = 1000 // frame_skip
# For a complete training, bring the number of frames up to 1M
total_frames = 10_000 // frame_skip
sub_batch_size = 64 # cardinality of the sub-samples gathered from the current data in the inner loop
num_epochs = 10 # optimisation steps per batch of data collected
clip_epsilon = 0.2  # clip value for PPO loss: see the equation in the intro for more context.
gamma = 0.99
lmbda = 0.95
entropy_eps = 1e-4
base_env = GymEnv(
    "CarRacing-v2",
    device=device,
    frame_skip=frame_skip,
    continuous=False,
    from_pixels=True,
    pixels_only=True,
)
transform = Compose(ToTensorImage(in_keys=["pixels"]))
env = TransformedEnv(base_env, transform)
n_actions = env.action_spec.space.n
print("observation_spec:", env.observation_spec)
print("reward_spec:", env.reward_spec)
print("done_spec:", env.done_spec)
print("action_spec:", env.action_spec)
check_env_specs(env)
rollout = env.rollout(10)
print("rollout of three steps:", rollout)
print("Shape of the rollout TensorDict:", rollout.batch_size)
actor_net = nn.Sequential(
nn.Linear(96, num_cells, device=device),
nn.Sigmoid(),
nn.Linear(num_cells, num_cells, device=device),
nn.Sigmoid(),
nn.Linear(num_cells, num_cells, device=device),
nn.Sigmoid(),
nn.Linear(num_cells, n_actions, device=device),
nn.Softmax(dim=-1),
)
policy_module = TensorDictModule(
actor_net, in_keys=["pixels"], out_keys=["action"]
)
policy_module = ProbabilisticActor(
module=policy_module,
spec=env.action_spec,
in_keys=["action"],
distribution_class=OneHotCategorical,
return_log_prob=True
)
value_net = nn.Sequential(
nn.LazyLinear(num_cells, device=device),
nn.Sigmoid(),
nn.LazyLinear(num_cells, device=device),
nn.Sigmoid(),
nn.LazyLinear(num_cells, device=device),
nn.Sigmoid(),
nn.LazyLinear(1, device=device),
)
value_module = ValueOperator(
module=value_net,
in_keys=["pixels"],
)
policy_module(env.reset())
Now, when I call policy_module(env.reset()) in the last line, I get the following error:
TypeError: distribution keywords and tensordict keys indicated by ProbabilisticTensorDictModule.in_keys must match.Got this error message:
__init__() got an unexpected keyword argument 'action'
with in_keys={'action': 'action'}
But to my limited understanding, shouldn't the ProbabilisticActor module take actions as input and output the log-probabilities of those actions? This is a discrete environment. How can I resolve this?
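For what it's worth, constructing the distribution directly seems to confirm that its __init__ only understands logits (or probs), not action. This is just a diagnostic sketch on my part, assuming 5 discrete actions as for CarRacing-v2 with continuous=False:

import torch
from torchrl.modules import OneHotCategorical

# Standalone check: the distribution builds fine from `logits`,
# whereas OneHotCategorical(action=...) raises the same TypeError as above.
dist = OneHotCategorical(logits=torch.zeros(5))  # 5 discrete CarRacing actions
action = dist.sample()        # one-hot encoded action of shape (5,)
print(action)
print(dist.log_prob(action))  # log-probability of the sampled action

So it looks like whatever key I hand to ProbabilisticActor via in_keys gets forwarded as a keyword to the distribution's constructor, but I am not sure what the correct wiring is supposed to be.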