Hello, I have a PPO network that is meant to receive the noise (SINR), the block error rate (BLER), and the difference between the current and the previous BLER, but the reward moves towards 0 instead of increasing.
(It is based on this example, but modified.)
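For context, each step of my custom MatlabEnv exposes three scalar observations that are later concatenated into a single "observation" vector. A minimal sketch of what one raw step output looks like (the numeric values here are made up for illustration):

import torch
from tensordict import TensorDict

# Made-up example of one raw observation from MatlabEnv, before the transforms:
# "sinr" is the current SINR, "blerc" the current BLER, "blera" the BLER delta.
example_obs = TensorDict(
    {
        "sinr": torch.tensor(3.7),     # bounded to [-6.5, 20.5]
        "blerc": torch.tensor(0.12),   # current block error rate
        "blera": torch.tensor(-0.03),  # difference w.r.t. the previous BLER
    },
    batch_size=[],
)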
num_cells = 2048  # number of cells in each layer, i.e. output dim.
lr = 1e-4
max_grad_norm = 1.0
frames_per_batch = 1000
# For a complete training, bring the number of frames up to 1M
total_frames = 100_000
sub_batch_size = 512  # cardinality of the sub-samples gathered from the current data in the inner loop
num_epochs = 10  # optimization steps per batch of data collected
clip_epsilon = 0.2  # clip value for PPO loss: see the equation in the intro for more context.
gamma = 0.99
lmbda = 0.95
entropy_eps = 1e-4
env = MatlabEnv(device=device)
env = TransformedEnv(
    env,
    Compose(
        # add a trailing dim so the three scalars can be concatenated
        UnsqueezeTransform(
            dim=-1,
            in_keys=["sinr", "blera", "blerc"],
            in_keys_inv=["sinr", "blera", "blerc"],
        ),
        CatTensors(
            in_keys=["sinr", "blera", "blerc"], dim=-1, out_key="observation", del_keys=False
        ),
        # normalize observations
        ObservationNorm(in_keys=["observation"]),
        DoubleToFloat(),
        StepCounter(),
    ),
)
# index 2 of the Compose is the ObservationNorm transform
env.transform[2].init_stats(num_iter=1000, reduce_dim=0, cat_dim=0)
actor_net = nn.Sequential(
    nn.LazyLinear(num_cells, device=device),
    nn.Tanh(),
    nn.LazyLinear(num_cells, device=device),
    nn.Tanh(),
    nn.LazyLinear(num_cells, device=device),
    nn.Tanh(),
    nn.LazyLinear(2 * env.action_spec.shape[-1], device=device),
    NormalParamExtractor(),
)
policy_module = TensorDictModule(
    actor_net, in_keys=["observation"], out_keys=["loc", "scale"]
)
policy_module = ProbabilisticActor(
    module=policy_module,
    spec=env.action_spec,
    in_keys=["loc", "scale"],
    distribution_class=TanhNormal,
    distribution_kwargs={
        "low": env.action_spec.space.low,
        "high": env.action_spec.space.high,
    },
    return_log_prob=True,
    # we'll need the log-prob for the numerator of the importance weights
)
value_net = nn.Sequential(
    nn.LazyLinear(num_cells, device=device),
    nn.Tanh(),
    nn.LazyLinear(num_cells, device=device),
    nn.Tanh(),
    nn.LazyLinear(num_cells, device=device),
    nn.Tanh(),
    nn.LazyLinear(1, device=device),
)
value_module = ValueOperator(
    module=value_net,
    in_keys=["observation"],
)
collector = SyncDataCollector(
    env,
    policy_module,
    frames_per_batch=frames_per_batch,
    total_frames=total_frames,
    split_trajs=False,
    device=device,
)
replay_buffer = ReplayBuffer(
    storage=LazyTensorStorage(max_size=frames_per_batch),
    sampler=SamplerWithoutReplacement(),
)
advantage_module = GAE(
    gamma=gamma, lmbda=lmbda, value_network=value_module, average_gae=True
)
loss_module = ClipPPOLoss(
    actor_network=policy_module,
    critic_network=value_module,
    clip_epsilon=clip_epsilon,
    entropy_bonus=bool(entropy_eps),
    entropy_coef=entropy_eps,
    # these keys match by default but we set this for completeness
    critic_coef=1.0,
    loss_critic_type="smooth_l1",
)
optim = torch.optim.Adam(loss_module.parameters(), lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optim, total_frames // frames_per_batch, 0.0
)
logs = defaultdict(list)
pbar = tqdm(total=total_frames)
eval_str = ""
And the training loop:
# We iterate over the collector until it reaches the total number of frames it
# was designed to collect:
for i, tensordict_data in enumerate(collector):
    # we now have a batch of data to work with. Let's learn something from it.
    for _ in range(num_epochs):
        # We'll need an "advantage" signal to make PPO work.
        # We re-compute it at each epoch as its value depends on the value
        # network which is updated in the inner loop.
        advantage_module(tensordict_data)
        data_view = tensordict_data.reshape(-1)
        # print(data_view)
        replay_buffer.extend(data_view.cpu())
        for _ in range(frames_per_batch // sub_batch_size):
            subdata = replay_buffer.sample(sub_batch_size)
            loss_vals = loss_module(subdata.to(device))
            loss_value = (
                loss_vals["loss_objective"]
                + loss_vals["loss_critic"]
                + loss_vals["loss_entropy"]
            )

            # Optimization: backward, grad clipping and optimization step
            loss_value.backward()
            # this is not strictly mandatory but it's good practice to keep
            # your gradient norm bounded
            torch.nn.utils.clip_grad_norm_(loss_module.parameters(), max_grad_norm)
            optim.step()
            optim.zero_grad()

    logs["reward"].append(tensordict_data["next", "reward"].mean().item())
    logs["sinr"].append(tensordict_data["sinr"].tolist()[0])
    pbar.update(tensordict_data.numel())
    cum_reward_str = (
        f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})"
    )
    logs["step_count"].append(tensordict_data["step_count"].max().item())
    stepcount_str = f"step count (max): {logs['step_count'][-1]}"
    logs["lr"].append(optim.param_groups[0]["lr"])
    lr_str = f"lr policy: {logs['lr'][-1]: 4.4f}"
    if i % 10 == 0:
        # We evaluate the policy once every 10 batches of data.
        # Evaluation is rather simple: execute the policy without exploration
        # (take the expected value of the action distribution) for a given
        # number of steps (1000, which is our ``env`` horizon).
        # The ``rollout`` method of the ``env`` can take a policy as argument:
        # it will then execute this policy at each step.
        with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
            # execute a rollout with the trained policy
            eval_rollout = env.rollout(1000, policy_module)
            logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
            logs["eval reward (sum)"].append(
                eval_rollout["next", "reward"].sum().item()
            )
            logs["eval step_count"].append(eval_rollout["step_count"].max().item())
            logs["action"].append(eval_rollout["action"].mean().item())
            eval_str = (
                f"eval cumulative reward: {logs['eval reward (sum)'][-1]: 4.4f} "
                f"(init: {logs['eval reward (sum)'][0]: 4.4f}), "
            )
            del eval_rollout
    pbar.set_description(", ".join([eval_str, cum_reward_str, stepcount_str, lr_str]))

    # We're also using a learning rate scheduler. Like the gradient clipping,
    # this is a nice-to-have but nothing necessary for PPO to work.
    scheduler.step()
My env step looks something like this:
sinr, blerc = tensordict["sinr"], tensordict["blerc"]
action = tensordict["action"].round().int().item()
# print(action)
# look up the BLER for this (action, SINR) pair
result = get_bler_from_file([action], [sinr.item()])[0]
# print(result)
new_blerc = torch.tensor(result, dtype=torch.float32, device="cuda", requires_grad=True)
new_blera = new_blerc - blerc
spectral_efficiency = torch.tensor(
    float(se_table[0][action] * se_table[1][action] / 1024), dtype=torch.float32
)
# print("Action, spectral efficiency: ", action, " ", spectral_efficiency)
percentage_transmitted_blocks = torch.tensor(1.0, dtype=torch.float32) - new_blerc
percentage_spectral_efficiency = percentage_transmitted_blocks * spectral_efficiency
reward = percentage_spectral_efficiency
new_sinr = sinr + 0.1
if new_sinr.item() > 20.5:
    new_sinr = new_sinr - 26.5
SINR is bounded between -6.5 and 20.5.
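After that, the step packs the new values into the TensorDict it returns, roughly like this (a simplified sketch; the "done"/termination handling and exact shapes here are approximate, not my literal code):

from tensordict import TensorDict

# Continuing the snippet above: pack the new observations and the reward into
# the TensorDict returned by the step. "done" is left always-False in this
# sketch, and the reward is reshaped to match a (1,)-shaped reward spec.
out = TensorDict(
    {
        "sinr": new_sinr,
        "blera": new_blera,
        "blerc": new_blerc,
        "reward": reward.unsqueeze(-1),
        "done": torch.zeros(1, dtype=torch.bool),
    },
    batch_size=tensordict.batch_size,
)
return out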
The network always seems to converge to action = 1, no matter whether the reward is negative or positive: if the reward is negative it settles around -5.5, and if it is positive it settles at 0 (the maximum possible reward is 5.5).
Why does my network converge to 0 reward? I have tried increasing and decreasing the learning rate and the entropy coefficient, training for longer, and using more cells per layer, but nothing seems to help.
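To show what I mean by "converges to action = 1", here is a quick way to inspect it (a diagnostic sketch; tensordict_data is the last batch from the collector):

# Diagnostic sketch: re-run the actor on the last collected batch and inspect
# the distribution parameters it outputs, to see whether the policy collapsed.
with torch.no_grad():
    td = tensordict_data.reshape(-1).to(device)
    policy_module(td)  # writes "loc", "scale" and a sampled "action" into td
    print("loc mean/std :", td["loc"].mean().item(), td["loc"].std().item())
    print("scale mean   :", td["scale"].mean().item())
    print("action mean  :", td["action"].mean().item())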
Results after training for 100k frames: