PPO with Categorical Action... help

Hello, I’m looking for examples or tutorials on PPO using torchrl with Categorical Actions for a custom Env. I’ve tried to adapt the inverted-double-pendulum tutorial but without success. If anyone can help me I would be very grateful…

Could you describe what exactly you’ve tried so far and where you were stuck?

This would help others chime in with potential next steps, etc.

OK, first here is a simple custom Env called “CosTrader”. The idea: a cosine function runs over time, and an agent, “the Trader”, has to learn to buy at the bottom and close at the top. The agent has a categorical action with shape = 1 and n = 3 (it takes only one action at a time: 0 = close position, 1 = do nothing, 2 = open a buy position). Of course it is not meant for financial use, only for learning purposes! Later I want to introduce noise, a varying level (k in params), etc. NB: I still need to add a mask on the action, but it is not implemented yet.
So here is the code “CosTrader.py”:

import torch
from tensordict.tensordict import TensorDict, TensorDictBase
from torchrl.data import Bounded, Composite, Unbounded, Categorical
from torchrl.envs import EnvBase
from torchrl.envs.utils import check_env_specs

from typing import Optional
import random

#############
rendering = True
#############
if rendering:
    import pygame
    from pygame import gfxdraw

def gen_params(batch_size=None) -> TensorDictBase:
    if batch_size is None:
        batch_size = []
    # f(x) = a.cos(b(x-h))+k
    td = TensorDict({"params": TensorDict({"a": 0.2, # ampli
                                           "b": 1, # freq
                                           "h": random.uniform(0,1), # phase
                                           "k": 1.25, # niveau
                                           "dt": 0.05},
                                          [],) },
                    [],)

    if batch_size:
        td = td.expand(batch_size).contiguous()
    return td

def make_composite_from_td(td):
    # custom function to convert a tensordict into a similar spec structure
    # made of unbounded values.
    composite = Composite({key: make_composite_from_td(tensor) if isinstance(tensor, TensorDictBase)
                               else Unbounded( dtype=tensor.dtype,
                                               device=tensor.device,
                                               shape=tensor.shape )
                               for key, tensor in td.items()},
                               shape=td.shape )
    return composite


class CosTraderEnv(EnvBase):
    batch_locked = False
    AUTO_UNWRAP_TRANSFORMED_ENV = False
    
    
    def __init__(self, td_params=None, seed=None, device="cpu", batch_size = []):
        super().__init__(device=device, batch_size=batch_size)
        
        if td_params is None:
            td_params = self.gen_params(batch_size)
        
        self._make_spec(td_params)
        if seed is None:
            seed = torch.empty((), dtype=torch.int64).random_().item()
        self.set_seed(seed)

        self.screen = None
        self.clock = None

    def _step(self,tensordict):
        a = tensordict["params", "a"]
        b = tensordict["params", "b"]
        h = tensordict["params", "h"]
        k = tensordict["params", "k"]
        dt = tensordict["params", "dt"]

        bid   = tensordict["bid"]      # price
        angle = tensordict["angle"]    # tangent (slope) of the curve
        posA  = tensordict["posA"]     # buy position
        solde = tensordict["solde"]    # balance
        t     = tensordict["t"]        # time
        
        u = tensordict["action"].squeeze(-1)
        #action:  0 = close,   1 = do nothing,    2 = buy

        C = torch.where(u == 0, 1, 0)
        A = torch.where(u == 2, 1, 0)
        R = torch.where(u == 1, 1, 0)
        
        no_pos = torch.where(posA == 0, 1, 0)
        in_pos = torch.where(posA >  0, 1, 0)
        add = torch.where(u == 2, bid*no_pos, 0)        
        new_posA = posA + add
        new_posA = torch.where(u == 0, 0, new_posA)
        
        new_t = t + dt        
        new_bid = a*(b*(t-h)).cos()+k  # f(x) = a.cos(b(x-h))+k
        new_angle = (new_bid - bid)/dt
        new_solde = new_bid - posA

        ####
        #reward = torch.zeros_like(u, dtype=torch.float32)
        reward_C = C* in_pos *(bid - posA)
        reward_R = (R* in_pos - R* no_pos) *(new_bid-bid)
        reward_A = A* no_pos *(new_bid-bid)
        reward = reward_C + reward_R + reward_A
        
        ####
        
        done = torch.zeros_like(reward, dtype=torch.bool)
        
        nextTD = TensorDict({"params": tensordict["params"],
                             "angle": new_angle,
                             "bid": new_bid,
                             "posA": new_posA,
                             "t": new_t,
                             "solde":new_solde,
                             "reward": reward,
                             "done": done,},
                            tensordict.shape,)

        if rendering:
            self.state = (angle.tolist(),bid.tolist(),new_posA.tolist())
            self.last_u = 0
            self.render()
        
        return nextTD


    def _reset(self, tensordict):
        if tensordict is None or tensordict.is_empty():
            tensordict = self.gen_params(batch_size=self.batch_size)
        t = torch.zeros(tensordict.shape, device = self.device)
        posA = torch.zeros(tensordict.shape, device = self.device)
        solde= torch.zeros(tensordict.shape, device = self.device)
        a = tensordict["params", "a"]
        b = tensordict["params", "b"]
        h = tensordict["params", "h"]
        k = tensordict["params", "k"]
        dt = tensordict["params", "dt"]
        
        bid =  a* (b*(t-h)).cos() + k #a.cos(b(x-h))+k
        angle = torch.zeros(tensordict.shape, device = self.device) #...

        out = TensorDict({"params": tensordict["params"],
                          "angle": angle,
                          "bid": bid,
                          "posA": posA,
                          "t": t,
                          "solde":solde},
                         batch_size=tensordict.shape,)

        if rendering:
            self.last_u = None 
            self.state = (angle.tolist(),bid.tolist(),posA.tolist())
            self.render()

        return out

        
    def _make_spec(self,td_params):
        # Under the hood, this will populate self.output_spec["observation"]
        self.observation_spec = Composite(angle=Bounded(low = -torch.pi/2,
                                                        high = torch.pi/2,
                                                        shape = (),
                                                        dtype = torch.float32),
                                          bid=Unbounded(shape = (),
                                                        dtype = torch.float32),
                                          posA=Unbounded(shape = (),
                                                         dtype = torch.float32),
                                          t=Bounded(low = 0,
                                                    high = 1000,
                                                    shape = (),
                                                    dtype = torch.float32),
                                          solde=Unbounded(shape = (),
                                                          dtype = torch.float32),
                                          params=make_composite_from_td(td_params["params"]),
                                          shape= ())
        
        self.state_spec = self.observation_spec.clone()
        
        self.action_spec = Categorical(n = 3, shape = (*td_params.shape,1))
        
        self.reward_spec = Unbounded(shape = (*td_params.shape, 1))
        
    gen_params = staticmethod(gen_params)
    
    def _set_seed(self, seed: Optional[int]):
        rng = torch.manual_seed(seed)
        self.rng = rng

    def get_obskeys(self):
        return ["angle", "bid","posA","t","solde"]

    def render(self):
        
        if self.screen is None:
            self.screen_dim = 400
            pygame.init()
            pygame.display.init()
            self.screen = pygame.display.set_mode((self.screen_dim,self.screen_dim))
        if self.clock is None:
            self.clock = pygame.time.Clock()

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                raise SystemExit

        self.surf = pygame.Surface((self.screen_dim, self.screen_dim))
        self.surf.fill((255, 255, 255))

        bound = 2.2
        scale = self.screen_dim / (bound * 2)
        offset = self.screen_dim // 2

        rod_length = 1 * scale
        rod_width = 0.02 * scale
        l, r, t, b = 0, rod_length, rod_width / 2, -rod_width / 2
        coords = [(l, b), (l, t), (r, t), (r, b)]
        transformed_coords = []
        for c in coords:
            try:
                c = pygame.math.Vector2(c).rotate_rad(self.state[0])# + np.pi / 2)
            except:
                c = pygame.math.Vector2(c).rotate_rad(self.state[0][0])# + np.pi / 2)
            c = (c[0] + offset, c[1] + offset)
            transformed_coords.append(c)
        gfxdraw.aapolygon(self.surf, transformed_coords, (204, 77, 77))
        gfxdraw.filled_polygon(self.surf, transformed_coords, (204, 77, 77))

        gfxdraw.aacircle(self.surf, offset, offset, int(rod_width / 2), (204, 77, 77))
        gfxdraw.filled_circle(
            self.surf, offset, offset, int(rod_width / 2), (204, 77, 77)
        )

        # drawing bid
        
        try:
            bid = self.state[1]
            gfxdraw.filled_circle(self.surf, 5,int(9*(bid-1)* scale), int(0.05 * scale), (0, 0, 0))
        except:
            bid = self.state[1][0]
            #gfxdraw.aacircle(self.surf, 5, int(9*(bid-1)* scale), int(0.05 * scale), (0, 0, 0))
            gfxdraw.filled_circle(self.surf, 5,int(9*(bid-1)* scale), int(0.05 * scale), (0, 0, 0))



         # drawing posA        
        try:
            vac = self.state[2]
            gfxdraw.filled_circle(self.surf, self.screen_dim-5,int(9*(vac-1)* scale), int(0.05 * scale), (0, 255, 0))
        except:
            vac = self.state[2][0]
            #gfxdraw.aacircle(self.surf, 5, int(9*(bid-1)* scale), int(0.05 * scale), (0, 0, 0))
            gfxdraw.filled_circle(self.surf, self.screen_dim-5,int(9*(vac-1)* scale), int(0.05 * scale), (0, 255, 0))

        #print(bid,vac)
        self.surf = pygame.transform.flip(self.surf, False, True)
        self.screen.blit(self.surf, (0, 0))
       
        
        self.clock.tick(30)
        pygame.display.flip()

    def close(self):
        if self.screen is not None:
            pygame.display.quit()
            pygame.quit()



if __name__ == "__main__":
    ### Tests ###
    env = CosTraderEnv()#batch_size = torch.Size([8]))
    check_env_specs(env)
    print("\n* observation_spec:", env.observation_spec)
    print("\n* action_spec:", env.action_spec)
    print("\n* reward_spec:", env.reward_spec)
    
    print("\n* random action 5: \n", env.action_spec.rand(torch.Size([5])))
    
    td = env.reset()
    print("\n* reset tensordict", td)

    td = env.rand_step(td)
    print("\n* random step tensordict", td)
    env.close()      

Secondly, I want to train my agent with PPO (and honestly I still have trouble understanding all of its inner workings), so I tried to adapt the “DoubleInvertedPendulum” tutorial, which uses a continuous action.
Here is the code “CosPPO.py”, a work in progress (but progressing slowly…)

from collections import defaultdict

import matplotlib.pyplot as plt
import torch
from tensordict.nn import TensorDictModule
from tensordict.nn.distributions import NormalParamExtractor
from torch import nn
from torch.distributions import Normal

from torchrl.collectors import SyncDataCollector
from torchrl.data.replay_buffers import ReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
from torchrl.data.replay_buffers.storages import LazyTensorStorage
from torchrl.envs import (
    Compose,
    DoubleToFloat,
    ObservationNorm,
    StepCounter,
    TransformedEnv,
    UnsqueezeTransform,
    CatTensors,
)
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs.utils import check_env_specs, ExplorationType, set_exploration_type
from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
from torchrl.objectives import ClipPPOLoss
from torchrl.objectives.value import GAE
#from tqdm import tqdm


#is_fork = multiprocessing.get_start_method() == "fork"
device = (torch.device("cpu"))
"""          
    torch.device(0)
    if torch.cuda.is_available() and not is_fork
    else torch.device("cpu"))
"""    
num_cells = 256  # number of cells in each layer i.e. output dim.
lr = 3e-4
max_grad_norm = 1.0


frames_per_batch = 1000
# For a complete training, bring the number of frames up to 1M
total_frames = 10_000


sub_batch_size = 64  # cardinality of the sub-samples gathered from the current data in the inner loop
num_epochs = 10  # optimization steps per batch of data collected
clip_epsilon = (
    0.2  # clip value for PPO loss: see the equation in the intro for more context.
)
gamma = 0.99
lmbda = 0.95
entropy_eps = 1e-4


import CosTrader
env = CosTrader.CosTraderEnv()#batch_size = torch.Size([8]))
# Transform ******************
obs_keys = env.get_obskeys()
env = TransformedEnv(
    env,
    # Unsqueezes the observations that we will concatenate    
    UnsqueezeTransform(
        dim=-1,
        in_keys=obs_keys,
        in_keys_inv=obs_keys,
    ),
)
cat_transform = CatTensors(
    in_keys=obs_keys, dim=-1, out_key="observation", del_keys=False
)
env.append_transform(cat_transform)
env = TransformedEnv(
    env,
    Compose(
        # normalize observations
        ObservationNorm(in_keys=["observation"]),
        DoubleToFloat(),
        StepCounter(),
    ),
)

# ******************************


env.transform[-3].init_stats(num_iter=1000, reduce_dim=0, cat_dim=0)


print("normalization constant shape:", env.transform[-3].loc.shape)

print("observation_spec:", env.observation_spec)
print("reward_spec:", env.reward_spec)
print("input_spec:", env.input_spec)
print("action_spec (as defined by input_spec):", env.action_spec)

check_env_specs(env)

rollout = env.rollout(3)
print("rollout of three steps:", rollout)
print("Shape of the rollout TensorDict:", rollout.batch_size)

######################################################################
# Batching computations

batch_size = 10   
td = env.reset(env.gen_params(batch_size=[batch_size]))
print("\n* reset (batch size of 10)", td)

td = env.rand_step(td)
print("\n* rand step (batch size of 10)", td)


# Hyperparameters ---------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_cells = 64  # number of cells in each layer i.e. output dim.
lr = 2e-3
max_grad_norm = 1.0
# -------------------------------------------------------------

policy_net = nn.Sequential(nn.LazyLinear(num_cells, device=device),
                          nn.Tanh(),
                          nn.LazyLinear(num_cells, device=device),
                          nn.Tanh(),
                          nn.LazyLinear(num_cells, device=device),
                          nn.Tanh(),
                          nn.LazyLinear(2 * env.action_spec.shape[-1], device=device),
                          NormalParamExtractor())

policy_module = TensorDictModule(policy_net,
                                 in_keys = ["observation"],
                                 out_keys = ["loc", "scale"])

policy_module = ProbabilisticActor(module=policy_module,
                                   spec = env.action_spec,
                                   in_keys = ["loc", "scale"],
                                   out_keys= ["action"],
                                   distribution_class = Normal,#TanhNormal,
                                   #distribution_kwargs = {"min": -1, "max": 1},
                                   return_log_prob = True)

critic_net = nn.Sequential(nn.LazyLinear(num_cells, device=device),
                          nn.Tanh(),
                          nn.LazyLinear(num_cells, device=device),
                          nn.Tanh(),
                          nn.LazyLinear(num_cells, device=device),
                          nn.Tanh(),
                          nn.LazyLinear(1, device=device))

critic = ValueOperator(module = critic_net,
                             in_keys = ["observation"])

print("\n* Running policy:", policy_module(env.reset()))
print("\n* Running criticvalue:", critic(env.reset()))


# Data parameters ---------------------------------------------
frame_skip = 1  
frames_per_batch = 150 // frame_skip
total_frames = 40_050 // frame_skip
minibatch_size = 64
# -------------------------------------------------------------

collector = SyncDataCollector(env,
                              policy_module,
                              frames_per_batch=frames_per_batch,
                              total_frames=total_frames,
                              split_trajs=False,
                              device=device,)


replay_buffer = ReplayBuffer(storage = LazyTensorStorage(frames_per_batch),
                             sampler = SamplerWithoutReplacement(),
                             batch_size = minibatch_size)

# PPO parameters ---------------------------------------------
num_epochs = 10  # optimization steps per batch of data collected
clip_epsilon = (0.2)  # clip value for PPO loss: see the equation in the intro for more context.
gamma = 0.99 # [0 ,1]
lmbda = 0.95 # [0 ,1]
entropy_eps = 1e-4
# -------------------------------------------------------------

advantage_module = GAE(gamma = gamma,
                       lmbda = lmbda,
                       value_network = critic,
                       average_gae = True)

loss_module = ClipPPOLoss(actor = policy_module,
                          critic = critic,
                          advantage_key = "advantage",
                          clip_epsilon = clip_epsilon,
                          entropy_bonus = bool(entropy_eps),
                          entropy_coeff = entropy_eps,
                          # these keys match by default but we set this for completeness
                          value_target_key = advantage_module.value_target_key,
                          critic_coeff = 1.0,
                          gamma = gamma,
                          loss_critic_type = "smooth_l1",)

optim = torch.optim.Adam(loss_module.parameters(), lr)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, total_frames // frames_per_batch, 0.0)

logs = defaultdict(list)

total=total_frames * frame_skip
faits = 0
eval_str = ""

print("TRAINING LOOP ***********************************************************")
for i, tensordict_data in enumerate(collector):
    # we now have a batch of data to work with. Let's learn something from it.
    #print(tensordict_data["action"])
    for _ in range(num_epochs):
        # We'll need an "advantage" signal to make PPO work.
        # We re-compute it at each epoch as its value depends on the value
        # network which is updated in the inner loop.
        advantage_module(tensordict_data)
        
        data_view = tensordict_data.reshape(-1)
        
        replay_buffer.extend(data_view.cpu())
        
        for _ in range(frames_per_batch // minibatch_size):
            subdata = replay_buffer.sample(minibatch_size)
            loss_vals = loss_module(subdata.to(device))
            loss_value = (loss_vals["loss_objective"] + loss_vals["loss_critic"] + loss_vals["loss_entropy"])

            # Optimization: backward, grad clipping and optimization step
            loss_value.backward()
            
            # this is not strictly mandatory but it's good practice to keep gradient norm bounded
            torch.nn.utils.clip_grad_norm_(loss_module.parameters(), max_grad_norm)
            optim.step()
            optim.zero_grad()

    logs["reward"].append(tensordict_data["next", "reward"].mean().item())
    faits += tensordict_data.numel() * frame_skip
    cum_reward_str = (f"average reward={logs['reward'][-1]: 4.4f}")# (init={logs['reward'][0]: 4.4f})")
    #logs["step_count"].append(tensordict_data["step_count"].max().item())
    #stepcount_str = f"step count (max): {logs['step_count'][-1]}"
    logs["lr"].append(optim.param_groups[0]["lr"])
    lr_str = f"lr policy: {logs['lr'][-1]: 4.4f}"
    if i % 10 == 0:
        with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
            # execute a rollout with the trained policy
            eval_rollout = env.rollout(1000, policy_module)
            logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
            logs["eval reward (sum)"].append(eval_rollout["next", "reward"].sum().item())
            #logs["eval step_count"].append(eval_rollout["step_count"].max().item())
            eval_str = (f"cum reward: {logs['eval reward (sum)'][-1]: 4.4f} "
                        #f"step-count: {logs['eval step_count'][-1]}"
                        )
            
            del eval_rollout
    collector.update_policy_weights_() ## ??
    progres = 100*faits/total
    print(f"{progres: 3.1f}%",", ".join([eval_str, cum_reward_str, lr_str]))
    # We're also using a learning rate scheduler. Like the gradient clipping,
    # this is a nice-to-have but nothing necessary for PPO to work.
    scheduler.step()

When running the code, here is the test output:

(IAvenv) fauche@debian:~/Documents/RenforcementLearning$ python CosPPO.py
pygame 2.6.1 (SDL 2.28.4, Python 3.11.2)
Hello from the pygame community. https://www.pygame.org/contribute.html
/home/fauche/Documents/IAvenv/lib/python3.11/site-packages/torchrl/envs/transforms/transforms.py:831: FutureWarning: The default behavior of TransformedEnv will change in version 0.9. Nested TransformedEnvs will no longer be automatically unwrapped by default. To prepare for this change, use set_auto_unwrap_transformed_env(val: bool) as a decorator or context manager, or set the environment variable AUTO_UNWRAP_TRANSFORMED_ENV to 'False'.
  instance: EnvBase = super(_EnvPostInit, self).__call__(*args, **kwargs)
normalization constant shape: torch.Size([5])
observation_spec: Composite(
    angle: BoundedContinuous(
        shape=torch.Size([1]),
        space=ContinuousBox(
            low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
            high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
        device=cpu,
        dtype=torch.float32,
        domain=continuous),
    bid: UnboundedContinuous(
        shape=torch.Size([1]),
        space=ContinuousBox(
            low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
            high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
        device=cpu,
        dtype=torch.float32,
        domain=continuous),
    posA: UnboundedContinuous(
        shape=torch.Size([1]),
        space=ContinuousBox(
            low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
            high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
        device=cpu,
        dtype=torch.float32,
        domain=continuous),
    t: BoundedContinuous(
        shape=torch.Size([1]),
        space=ContinuousBox(
            low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
            high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
        device=cpu,
        dtype=torch.float32,
        domain=continuous),
    solde: UnboundedContinuous(
        shape=torch.Size([1]),
        space=ContinuousBox(
            low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
            high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
        device=cpu,
        dtype=torch.float32,
        domain=continuous),
    params: Composite(
        a: UnboundedContinuous(
            shape=torch.Size([]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True),
                high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True)),
            device=cpu,
            dtype=torch.float32,
            domain=continuous),
        b: UnboundedDiscrete(
            shape=torch.Size([]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, contiguous=True),
                high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, contiguous=True)),
            device=cpu,
            dtype=torch.int64,
            domain=discrete),
        h: UnboundedContinuous(
            shape=torch.Size([]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True),
                high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True)),
            device=cpu,
            dtype=torch.float32,
            domain=continuous),
        k: UnboundedContinuous(
            shape=torch.Size([]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True),
                high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True)),
            device=cpu,
            dtype=torch.float32,
            domain=continuous),
        dt: UnboundedContinuous(
            shape=torch.Size([]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True),
                high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True)),
            device=cpu,
            dtype=torch.float32,
            domain=continuous),
        device=cpu,
        shape=torch.Size([]),
        data_cls=None),
    observation: UnboundedContinuous(
        shape=torch.Size([5]),
        space=ContinuousBox(
            low=Tensor(shape=torch.Size([5]), device=cpu, dtype=torch.float32, contiguous=True),
            high=Tensor(shape=torch.Size([5]), device=cpu, dtype=torch.float32, contiguous=True)),
        device=cpu,
        dtype=torch.float32,
        domain=continuous),
    step_count: BoundedDiscrete(
        shape=torch.Size([1]),
        space=ContinuousBox(
            low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, contiguous=True),
            high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, contiguous=True)),
        device=cpu,
        dtype=torch.int64,
        domain=discrete),
    device=cpu,
    shape=torch.Size([]),
    data_cls=None)
reward_spec: UnboundedContinuous(
    shape=torch.Size([1]),
    space=ContinuousBox(
        low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
        high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
    device=cpu,
    dtype=torch.float32,
    domain=continuous)
input_spec: Composite(
    full_state_spec: Composite(
        angle: BoundedContinuous(
            shape=torch.Size([1]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
                high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
            device=cpu,
            dtype=torch.float32,
            domain=continuous),
        bid: UnboundedContinuous(
            shape=torch.Size([1]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
                high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
            device=cpu,
            dtype=torch.float32,
            domain=continuous),
        posA: UnboundedContinuous(
            shape=torch.Size([1]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
                high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
            device=cpu,
            dtype=torch.float32,
            domain=continuous),
        t: BoundedContinuous(
            shape=torch.Size([1]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
                high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
            device=cpu,
            dtype=torch.float32,
            domain=continuous),
        solde: UnboundedContinuous(
            shape=torch.Size([1]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
                high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
            device=cpu,
            dtype=torch.float32,
            domain=continuous),
        params: Composite(
            a: UnboundedContinuous(
                shape=torch.Size([]),
                space=ContinuousBox(
                    low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True),
                    high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True)),
                device=cpu,
                dtype=torch.float32,
                domain=continuous),
            b: UnboundedDiscrete(
                shape=torch.Size([]),
                space=ContinuousBox(
                    low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, contiguous=True),
                    high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, contiguous=True)),
                device=cpu,
                dtype=torch.int64,
                domain=discrete),
            h: UnboundedContinuous(
                shape=torch.Size([]),
                space=ContinuousBox(
                    low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True),
                    high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True)),
                device=cpu,
                dtype=torch.float32,
                domain=continuous),
            k: UnboundedContinuous(
                shape=torch.Size([]),
                space=ContinuousBox(
                    low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True),
                    high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True)),
                device=cpu,
                dtype=torch.float32,
                domain=continuous),
            dt: UnboundedContinuous(
                shape=torch.Size([]),
                space=ContinuousBox(
                    low=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True),
                    high=Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, contiguous=True)),
                device=cpu,
                dtype=torch.float32,
                domain=continuous),
            device=cpu,
            shape=torch.Size([]),
            data_cls=None),
        step_count: BoundedDiscrete(
            shape=torch.Size([1]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, contiguous=True),
                high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, contiguous=True)),
            device=cpu,
            dtype=torch.int64,
            domain=discrete),
        device=cpu,
        shape=torch.Size([]),
        data_cls=None),
    full_action_spec: Composite(
        action: Categorical(
            shape=torch.Size([1]),
            space=CategoricalBox(n=3),
            device=cpu,
            dtype=torch.int64,
            domain=discrete),
        device=cpu,
        shape=torch.Size([]),
        data_cls=None),
    device=cpu,
    shape=torch.Size([]),
    data_cls=None)
action_spec (as defined by input_spec): Categorical(
    shape=torch.Size([1]),
    space=CategoricalBox(n=3),
    device=cpu,
    dtype=torch.int64,
    domain=discrete)
2025-08-09 10:51:22,401 [torchrl][INFO]    check_env_specs succeeded! [END]
rollout of three steps: TensorDict(
    fields={
        action: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.int64, is_shared=False),
        angle: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        bid: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        done: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        next: TensorDict(
            fields={
                angle: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                bid: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                done: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
                params: TensorDict(
                    fields={
                        a: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.float32, is_shared=False),
                        b: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.int64, is_shared=False),
                        dt: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.float32, is_shared=False),
                        h: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.float32, is_shared=False),
                        k: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.float32, is_shared=False)},
                    batch_size=torch.Size([3]),
                    device=None,
                    is_shared=False),
                posA: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                reward: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                solde: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                step_count: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.int64, is_shared=False),
                t: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                terminated: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.bool, is_shared=False)},
            batch_size=torch.Size([3]),
            device=None,
            is_shared=False),
        observation: Tensor(shape=torch.Size([3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
        params: TensorDict(
            fields={
                a: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.float32, is_shared=False),
                b: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.int64, is_shared=False),
                dt: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.float32, is_shared=False),
                h: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.float32, is_shared=False),
                k: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.float32, is_shared=False)},
            batch_size=torch.Size([3]),
            device=None,
            is_shared=False),
        posA: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        solde: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        step_count: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.int64, is_shared=False),
        t: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        terminated: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.bool, is_shared=False)},
    batch_size=torch.Size([3]),
    device=None,
    is_shared=False)
Shape of the rollout TensorDict: torch.Size([3])

* reset (batch size of 10) TensorDict(
    fields={
        angle: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        bid: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        observation: Tensor(shape=torch.Size([10, 5]), device=cpu, dtype=torch.float32, is_shared=False),
        params: TensorDict(
            fields={
                a: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False),
                b: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.int64, is_shared=False),
                dt: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False),
                h: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False),
                k: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False)},
            batch_size=torch.Size([10]),
            device=None,
            is_shared=False),
        posA: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        solde: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        step_count: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.int64, is_shared=False),
        t: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        terminated: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False)},
    batch_size=torch.Size([10]),
    device=None,
    is_shared=False)

* rand step (batch size of 10) TensorDict(
    fields={
        action: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.int64, is_shared=False),
        angle: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        bid: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        next: TensorDict(
            fields={
                angle: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                bid: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([10, 5]), device=cpu, dtype=torch.float32, is_shared=False),
                params: TensorDict(
                    fields={
                        a: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False),
                        b: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.int64, is_shared=False),
                        dt: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False),
                        h: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False),
                        k: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False)},
                    batch_size=torch.Size([10]),
                    device=None,
                    is_shared=False),
                posA: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                reward: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                solde: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                step_count: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.int64, is_shared=False),
                t: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                terminated: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False)},
            batch_size=torch.Size([10]),
            device=None,
            is_shared=False),
        observation: Tensor(shape=torch.Size([10, 5]), device=cpu, dtype=torch.float32, is_shared=False),
        params: TensorDict(
            fields={
                a: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False),
                b: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.int64, is_shared=False),
                dt: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False),
                h: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False),
                k: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False)},
            batch_size=torch.Size([10]),
            device=None,
            is_shared=False),
        posA: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        solde: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        step_count: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.int64, is_shared=False),
        t: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        terminated: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False)},
    batch_size=torch.Size([10]),
    device=None,
    is_shared=False)

* Running policy: TensorDict(
    fields={
        action: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        action_log_prob: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        angle: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        bid: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        loc: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        observation: Tensor(shape=torch.Size([5]), device=cpu, dtype=torch.float32, is_shared=False),
        params: TensorDict(
            fields={
                a: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False),
                b: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, is_shared=False),
                dt: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False),
                h: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False),
                k: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)},
            batch_size=torch.Size([]),
            device=None,
            is_shared=False),
        posA: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        scale: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        solde: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        step_count: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, is_shared=False),
        t: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        terminated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False)},
    batch_size=torch.Size([]),
    device=None,
    is_shared=False)

* Running criticvalue: TensorDict(
    fields={
        angle: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        bid: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        observation: Tensor(shape=torch.Size([5]), device=cpu, dtype=torch.float32, is_shared=False),
        params: TensorDict(
            fields={
                a: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False),
                b: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, is_shared=False),
                dt: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False),
                h: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False),
                k: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)},
            batch_size=torch.Size([]),
            device=None,
            is_shared=False),
        posA: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        solde: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        state_value: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        step_count: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, is_shared=False),
        t: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        terminated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False)},
    batch_size=torch.Size([]),
    device=None,
    is_shared=False)

And then I get an error (and certainly there are many others) in loss_module, with this traceback:

Traceback (most recent call last):
  File "/home/fauche/Documents/RenforcementLearning/CosPPO.py", line 190, in <module>
    loss_module = ClipPPOLoss(actor = policy_module,
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fauche/Documents/IAvenv/lib/python3.11/site-packages/torchrl/objectives/ppo.py", line 1094, in __init__
    super().__init__(
  File "/home/fauche/Documents/IAvenv/lib/python3.11/site-packages/torchrl/objectives/ppo.py", line 490, in __init__
    raise TypeError(_GAMMA_LMBDA_DEPREC_ERROR)
TypeError: Passing gamma / lambda parameters through the loss constructor is a deprecated feature. To customize your value function, run `loss_module.make_value_estimator(ValueEstimators.<value_fun>, gamma=val)`.


Thank you very much for your help !

OK, I understand… I had taken an old tutorial as a reference! I'll modify “CosPPO.py” and come back.
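
The deprecation message asks for the discount parameters to be set on the loss's value estimator rather than in the constructor. As a minimal sketch of that route, reusing the names from the script above (note that the modified script below keeps the separate GAE module instead):

from torchrl.objectives import ClipPPOLoss, ValueEstimators

loss_module = ClipPPOLoss(policy_module,
                          critic,
                          clip_epsilon=clip_epsilon,
                          entropy_bonus=bool(entropy_eps),
                          loss_critic_type="smooth_l1")
# gamma / lmbda are now configured on the value estimator attached to the loss
loss_module.make_value_estimator(ValueEstimators.GAE, gamma=gamma, lmbda=lmbda)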

Modified code for “CosPPO.py”:

from collections import defaultdict

import matplotlib.pyplot as plt
import torch
from tensordict.nn import TensorDictModule
from tensordict.nn.distributions import NormalParamExtractor
from torch import nn
from torch.distributions import Normal

from torchrl.collectors import SyncDataCollector
from torchrl.data.replay_buffers import ReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
from torchrl.data.replay_buffers.storages import LazyTensorStorage
from torchrl.envs import (
    Compose,
    DoubleToFloat,
    ObservationNorm,
    StepCounter,
    TransformedEnv,
    UnsqueezeTransform,
    CatTensors,
)
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs.utils import check_env_specs, ExplorationType, set_exploration_type
from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
from torchrl.objectives import ClipPPOLoss
from torchrl.objectives.value import GAE
#from tqdm import tqdm


#is_fork = multiprocessing.get_start_method() == "fork"
device = (torch.device("cpu"))
"""          
    torch.device(0)
    if torch.cuda.is_available() and not is_fork
    else torch.device("cpu"))
"""    
num_cells = 256  # number of cells in each layer i.e. output dim.
lr = 3e-4
max_grad_norm = 1.0


frames_per_batch = 1000
# For a complete training, bring the number of frames up to 1M
total_frames = 10_000


sub_batch_size = 64  # cardinality of the sub-samples gathered from the current data in the inner loop
num_epochs = 10  # optimization steps per batch of data collected
clip_epsilon = (
    0.2  # clip value for PPO loss: see the equation in the intro for more context.
)
gamma = 0.99
lmbda = 0.95
entropy_eps = 1e-4


import CosTrader
env = CosTrader.CosTraderEnv()#batch_size = torch.Size([8]))
# Transform ******************
obs_keys = env.get_obskeys()
env = TransformedEnv(
    env,
    # Unsqueezes the observations that we will concatenate    
    UnsqueezeTransform(
        dim=-1,
        in_keys=obs_keys,
        in_keys_inv=obs_keys,
    ),
)
cat_transform = CatTensors(
    in_keys=obs_keys, dim=-1, out_key="observation", del_keys=False
)
env.append_transform(cat_transform)
env = TransformedEnv(
    env,
    Compose(
        # normalize observations
        ObservationNorm(in_keys=["observation"]),
        DoubleToFloat(),
        StepCounter(),
    ),
)

# ******************************


env.transform[-3].init_stats(num_iter=1000, reduce_dim=0, cat_dim=0)


print("normalization constant shape:", env.transform[-3].loc.shape)

print("observation_spec:", env.observation_spec)
print("reward_spec:", env.reward_spec)
print("input_spec:", env.input_spec)
print("action_spec (as defined by input_spec):", env.action_spec)

check_env_specs(env)

rollout = env.rollout(3)
print("rollout of three steps:", rollout)
print("Shape of the rollout TensorDict:", rollout.batch_size)

######################################################################
# Batching computations

batch_size = 10   
td = env.reset(env.gen_params(batch_size=[batch_size]))
print("\n* reset (batch size of 10)", td)

td = env.rand_step(td)
print("\n* rand step (batch size of 10)", td)


# Hyperparameters ---------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_cells = 64  # number of cells in each layer i.e. output dim.
lr = 2e-3
max_grad_norm = 1.0
# -------------------------------------------------------------

policy_net = nn.Sequential(nn.LazyLinear(num_cells, device=device),
                          nn.Tanh(),
                          nn.LazyLinear(num_cells, device=device),
                          nn.Tanh(),
                          nn.LazyLinear(num_cells, device=device),
                          nn.Tanh(),
                          nn.LazyLinear(2 * env.action_spec.shape[-1], device=device),
                          NormalParamExtractor())

policy_module = TensorDictModule(policy_net,
                                 in_keys = ["observation"],
                                 out_keys = ["loc", "scale"])

policy_module = ProbabilisticActor(module=policy_module,
                                   spec = env.action_spec,
                                   in_keys = ["loc", "scale"],
                                   out_keys= ["action"],
                                   distribution_class = Normal,#TanhNormal,
                                   #distribution_kwargs = {"min": -1, "max": 1},
                                   return_log_prob = True)

critic_net = nn.Sequential(nn.LazyLinear(num_cells, device=device),
                          nn.Tanh(),
                          nn.LazyLinear(num_cells, device=device),
                          nn.Tanh(),
                          nn.LazyLinear(num_cells, device=device),
                          nn.Tanh(),
                          nn.LazyLinear(1, device=device))

critic = ValueOperator(module = critic_net,
                             in_keys = ["observation"])

print("\n* Running policy:", policy_module(env.reset()))
print("\n* Running criticvalue:", critic(env.reset()))


# Data parameters ---------------------------------------------
frame_skip = 1  
frames_per_batch = 150 // frame_skip
total_frames = 40_050 // frame_skip
minibatch_size = 64
# -------------------------------------------------------------

collector = SyncDataCollector(env,
                              policy_module,
                              frames_per_batch=frames_per_batch,
                              total_frames=total_frames,
                              split_trajs=False,
                              device=device,)


replay_buffer = ReplayBuffer(storage = LazyTensorStorage(frames_per_batch),
                             sampler = SamplerWithoutReplacement(),
                             batch_size = minibatch_size)

# PPO parameters ---------------------------------------------
num_epochs = 10  # optimization steps per batch of data collected
clip_epsilon = (0.2)  # clip value for PPO loss: see the equation in the intro for more context.
gamma = 0.99 # [0 ,1]
lmbda = 0.95 # [0 ,1]
entropy_eps = 1e-4
# -------------------------------------------------------------

advantage_module = GAE(gamma = gamma,
                       lmbda = lmbda,
                       value_network = critic,
                       average_gae = True)

loss_module = ClipPPOLoss(
    actor_network=policy_module,
    critic_network= critic,
    clip_epsilon=clip_epsilon,
    entropy_bonus=bool(entropy_eps),
    entropy_coef=entropy_eps,
    # these keys match by default but we set this for completeness
    critic_coef=1.0,
    loss_critic_type="smooth_l1",
)

optim = torch.optim.Adam(loss_module.parameters(), lr)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, total_frames // frames_per_batch, 0.0)

logs = defaultdict(list)

total=total_frames * frame_skip
faits = 0
eval_str = ""

print("TRAINING LOOP ***********************************************************")
for i, tensordict_data in enumerate(collector):
    # we now have a batch of data to work with. Let's learn something from it.
    #print(tensordict_data["action"])
    for _ in range(num_epochs):
        # We'll need an "advantage" signal to make PPO work.
        # We re-compute it at each epoch as its value depends on the value
        # network which is updated in the inner loop.
        advantage_module(tensordict_data)
        
        data_view = tensordict_data.reshape(-1)
        
        replay_buffer.extend(data_view.cpu())
        
        for _ in range(frames_per_batch // minibatch_size):
            subdata = replay_buffer.sample(minibatch_size)
            loss_vals = loss_module(subdata.to(device))
            loss_value = (loss_vals["loss_objective"] + loss_vals["loss_critic"] + loss_vals["loss_entropy"])

            # Optimization: backward, grad clipping and optimization step
            loss_value.backward()
            
            # this is not strictly mandatory but it's good practice to keep gradient norm bounded
            torch.nn.utils.clip_grad_norm_(loss_module.parameters(), max_grad_norm)
            optim.step()
            optim.zero_grad()

    logs["reward"].append(tensordict_data["next", "reward"].mean().item())
    faits += tensordict_data.numel() * frame_skip
    cum_reward_str = (f"average reward={logs['reward'][-1]: 4.4f}")# (init={logs['reward'][0]: 4.4f})")
    #logs["step_count"].append(tensordict_data["step_count"].max().item())
    #stepcount_str = f"step count (max): {logs['step_count'][-1]}"
    logs["lr"].append(optim.param_groups[0]["lr"])
    lr_str = f"lr policy: {logs['lr'][-1]: 4.4f}"
    if i % 10 == 0:
        with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
            # execute a rollout with the trained policy
            eval_rollout = env.rollout(1000, policy_module)
            logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
            logs["eval reward (sum)"].append(eval_rollout["next", "reward"].sum().item())
            #logs["eval step_count"].append(eval_rollout["step_count"].max().item())
            eval_str = (f"cum reward: {logs['eval reward (sum)'][-1]: 4.4f} "
                        #f"step-count: {logs['eval step_count'][-1]}"
                        )
            
            del eval_rollout
    collector.update_policy_weights_() ## ??
    progres = 100*faits/total
    print(f"{progres: 3.1f}%",", ".join([eval_str, cum_reward_str, lr_str]))
    # We're also using a learning rate scheduler. Like the gradient clipping,
    # this is a nice-to-have but nothing necessary for PPO to work.
    scheduler.step()

plt.figure(figsize=(10, 10))
plt.subplot(2, 2, 1)
plt.plot(logs["reward"])
plt.title("training rewards (average)")
plt.subplot(2, 2, 2)
#plt.plot(logs["step_count"])
#plt.title("Max step count (training)")
plt.subplot(2, 2, 3)
plt.plot(logs["eval reward (sum)"])
plt.title("Return (test)")
plt.subplot(2, 2, 4)
#plt.plot(logs["eval step_count"])
#plt.title("Max step count (test)")
plt.show() 

But it doesn't learn, and when looking at the actions I see floats, not ints! So I tried safe = True in ProbabilisticActor; the actions are still floats but they do correspond to (0.0, 1.0, 2.0). Is that the right way?

OK, I've changed distribution_class to MaskedCategorical (because I am adding a dynamic mask; otherwise a plain Categorical distribution should be used for the unmasked version). I've also changed the ProbabilisticActor's in_keys to logits and action_mask, and the net's out_keys to “logits”. I removed NormalParamExtractor too, among other changes…
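
A rough sketch of that kind of masked, discrete actor (illustrative only, not the exact modified script; it assumes the mask is exposed under an "action_mask" key and uses the concatenated "observation" key from the transforms above):

import torch
from torch import nn
from tensordict.nn import TensorDictModule
from torchrl.modules import ProbabilisticActor, MaskedCategorical

n_actions = 3  # 0 = close, 1 = do nothing, 2 = buy

# the net outputs one logit per discrete action; no NormalParamExtractor needed
policy_net = nn.Sequential(nn.LazyLinear(64), nn.Tanh(),
                           nn.LazyLinear(64), nn.Tanh(),
                           nn.LazyLinear(n_actions))

policy_module = TensorDictModule(policy_net,
                                 in_keys=["observation"],
                                 out_keys=["logits"])

# the dict form of in_keys maps the distribution's keyword arguments
# ("logits", "mask") to the tensordict keys that hold them
actor = ProbabilisticActor(module=policy_module,
                           in_keys={"logits": "logits", "mask": "action_mask"},
                           out_keys=["action"],
                           distribution_class=MaskedCategorical,
                           return_log_prob=True)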

Hello
For categorical actions you should definitely use a Categorical distribution.
Here are some pointers:

LMK if you have specific questions about these!
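
For the unmasked case, a minimal sketch of such a discrete policy head (a plain Categorical over logits; the "observation" key and the 3 actions follow the env above, the layer sizes are arbitrary):

import torch
from torch import nn
from tensordict.nn import TensorDictModule
from torchrl.modules import ProbabilisticActor

policy_net = nn.Sequential(nn.LazyLinear(64), nn.Tanh(),
                           nn.LazyLinear(3))   # 3 logits: close / do nothing / buy

policy_module = TensorDictModule(policy_net,
                                 in_keys=["observation"],
                                 out_keys=["logits"])

actor = ProbabilisticActor(module=policy_module,
                           in_keys=["logits"],
                           out_keys=["action"],
                           distribution_class=torch.distributions.Categorical,
                           return_log_prob=True)
# note: Categorical samples integer actions with no trailing dimension,
# so the env's action_spec shape (*batch, 1) and the squeeze in _step must agree with it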

Hello, when I take your action mask example, the Env doesn't pass “check_env_specs”:

import torch
from torchrl.data.tensor_specs import Categorical, Binary, Unbounded, Composite
from torchrl.envs.transforms import ActionMask, TransformedEnv
from torchrl.envs.common import EnvBase
from torchrl.envs.utils import check_env_specs

class MaskedEnv(EnvBase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.action_spec = Categorical(4)
        self.state_spec = Composite(action_mask=Binary(4, dtype=torch.bool))
        self.observation_spec = Composite(obs=Unbounded(3))
        self.reward_spec = Unbounded(1)
        
    def _reset(self, tensordict=None):
        td = self.observation_spec.rand()
        td.update(torch.ones_like(self.state_spec.rand()))
        return td
    
    def _step(self, data):
        td = self.observation_spec.rand()
        mask = data.get("action_mask")
        action = data.get("action")
        mask = mask.scatter(-1, action.unsqueeze(-1), 0)
        td.set("action_mask", mask)
        td.set("reward", self.reward_spec.rand())
        td.set("done", ~mask.any().view(1))
        return td

    def _set_seed(self, seed) -> None:
        pass
torch.manual_seed(0)
base_env = MaskedEnv()
env = TransformedEnv(base_env, ActionMask())
check_env_specs(env)

It returns this traceback:

Traceback (most recent call last):
  File "/home/fauche/Documents/RenforcementLearning/testActionMask.py", line 34, in <module>
    check_env_specs(env)
  File "/home/fauche/Documents/IAvenv/lib/python3.11/site-packages/torchrl/envs/utils.py", line 789, in check_env_specs
    raise AssertionError(
AssertionError: The keys of the specs and data do not match:
    - List of keys present in real but not in fake: {('next', 'action_mask')},
    - List of keys present in fake but not in real: set().

Good catch, the action mask should be registered in the observation spec too!
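
For completeness, a sketch of the example with that fix applied (the mask declared in the observation spec as well, since _step writes it back; everything else unchanged):

import torch
from torchrl.data.tensor_specs import Categorical, Binary, Unbounded, Composite
from torchrl.envs.transforms import ActionMask, TransformedEnv
from torchrl.envs.common import EnvBase
from torchrl.envs.utils import check_env_specs

class MaskedEnv(EnvBase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.action_spec = Categorical(4)
        self.state_spec = Composite(action_mask=Binary(4, dtype=torch.bool))
        # the mask is written back by _step, so it must also appear in the observation spec
        self.observation_spec = Composite(obs=Unbounded(3),
                                          action_mask=Binary(4, dtype=torch.bool))
        self.reward_spec = Unbounded(1)

    def _reset(self, tensordict=None):
        td = self.observation_spec.rand()
        td.update(torch.ones_like(self.state_spec.rand()))
        return td

    def _step(self, data):
        td = self.observation_spec.rand()
        mask = data.get("action_mask")
        action = data.get("action")
        mask = mask.scatter(-1, action.unsqueeze(-1), 0)
        td.set("action_mask", mask)
        td.set("reward", self.reward_spec.rand())
        td.set("done", ~mask.any().view(1))
        return td

    def _set_seed(self, seed) -> None:
        pass

torch.manual_seed(0)
env = TransformedEnv(MaskedEnv(), ActionMask())
check_env_specs(env)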