
Reinforcement Learning PPO Tutorial doesn't work #3228

Closed
@Achilles718611

Description

📚 The doc issue

https://pytorch.org/tutorials/intermediate/reinforcement_ppo.html

My Environment

Windows 11 Pro
NVIDIA GeForce RTX 3060 Ti

Python 3.12.7

torch==2.5.1+cu124
torchrl==0.6.0
torchvision==0.20.1+cu124
gym==0.26.2
gymnasium[classic_control]==1.0.0

Since I have both gym and gymnasium installed, I set the backend explicitly when initializing the environment.

with set_gym_backend("gym"):
    base_env = GymEnv("InvertedDoublePendulum-v4", device=device)

Everything works fine up to the last step of the tutorial; for example, the environment check succeeds:

check_env_specs(env)
2025-01-10 18:56:09,849 [torchrl][INFO] check_env_specs succeeded!

Here is the code for the training loop (the last step), which is where it fails.

logs = defaultdict(list)
pbar = tqdm(total=total_frames)
eval_str = ""

# We iterate over the collector until it reaches the total number of frames it was
# designed to collect:
for i, tensordict_data in enumerate(collector):
    # we now have a batch of data to work with. Let's learn something from it.
    for _ in range(num_epochs):
        # We'll need an "advantage" signal to make PPO work.
        # We re-compute it at each epoch as its value depends on the value
        # network which is updated in the inner loop.
        advantage_module(tensordict_data)
        data_view = tensordict_data.reshape(-1)
        replay_buffer.extend(data_view.cpu())
        for _ in range(frames_per_batch // sub_batch_size):
            subdata = replay_buffer.sample(sub_batch_size)
            loss_vals = loss_module(subdata.to(device))
            loss_value = (
                loss_vals["loss_objective"]
                + loss_vals["loss_critic"]
                + loss_vals["loss_entropy"]
            )

            # Optimization: backward, grad clipping and optimization step
            loss_value.backward()
            # this is not strictly mandatory but it's good practice to keep
            # your gradient norm bounded
            torch.nn.utils.clip_grad_norm_(loss_module.parameters(), max_grad_norm)
            optim.step()
            optim.zero_grad()

    logs["reward"].append(tensordict_data["next", "reward"].mean().item())
    pbar.update(tensordict_data.numel())
    cum_reward_str = (
        f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})"
    )
    logs["step_count"].append(tensordict_data["step_count"].max().item())
    stepcount_str = f"step count (max): {logs['step_count'][-1]}"
    logs["lr"].append(optim.param_groups[0]["lr"])
    lr_str = f"lr policy: {logs['lr'][-1]: 4.4f}"
    if i % 10 == 0:
        # We evaluate the policy once every 10 batches of data.
        # Evaluation is rather simple: execute the policy without exploration
        # (take the expected value of the action distribution) for a given
        # number of steps (1000, which is our ``env`` horizon).
        # The ``rollout`` method of the ``env`` can take a policy as argument:
        # it will then execute this policy at each step.
        with set_exploration_type(ExplorationType.MEAN), torch.no_grad():
            # execute a rollout with the trained policy
            eval_rollout = env.rollout(1000, policy_module)
            logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
            logs["eval reward (sum)"].append(
                eval_rollout["next", "reward"].sum().item()
            )
            logs["eval step_count"].append(eval_rollout["step_count"].max().item())
            eval_str = (
                f"eval cumulative reward: {logs['eval reward (sum)'][-1]: 4.4f} "
                f"(init: {logs['eval reward (sum)'][0]: 4.4f}), "
                f"eval step-count: {logs['eval step_count'][-1]}"
            )
            del eval_rollout
    pbar.set_description(", ".join([eval_str, cum_reward_str, stepcount_str, lr_str]))

    # We're also using a learning rate scheduler. Like the gradient clipping,
    # this is a nice-to-have but nothing necessary for PPO to work.
    scheduler.step()

Here are the error details.

  2%|██▌                                                                                                                            | 1000/50000 [00:04<03:40, 221.83it/s]
---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
Cell In[17], line 51
     42 if i % 10 == 0:
     43     # We evaluate the policy once every 10 batches of data.
     44     # Evaluation is rather simple: execute the policy without exploration
   (...)
     47     # The ``rollout`` method of the ``env`` can take a policy as argument:
     48     # it will then execute this policy at each step.
     49     with set_exploration_type(ExplorationType.MEAN), torch.no_grad():
     50         # execute a rollout with the trained policy
---> 51         eval_rollout = env.rollout(1000, policy_module)
     52         logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
     53         logs["eval reward (sum)"].append(
     54             eval_rollout["next", "reward"].sum().item()
     55         )

File D:\Study\venv\Lib\site-packages\torchrl\envs\common.py:2635, in EnvBase.rollout(self, max_steps, policy, callback, auto_reset, auto_cast_to_device, break_when_any_done, break_when_all_done, return_contiguous, tensordict, set_truncated, out, trust_policy)
   2625 kwargs = {
   2626     "tensordict": tensordict,
   2627     "auto_cast_to_device": auto_cast_to_device,
   (...)
   2632     "callback": callback,
   2633 }
   2634 if break_when_any_done or break_when_all_done:
-> 2635     tensordicts = self._rollout_stop_early(
   2636         break_when_all_done=break_when_all_done,
   2637         break_when_any_done=break_when_any_done,
   2638         **kwargs,
   2639     )
   2640 else:
   2641     tensordicts = self._rollout_nonstop(**kwargs)

File D:\Study\venv\Lib\site-packages\torchrl\envs\common.py:2722, in EnvBase._rollout_stop_early(self, break_when_any_done, break_when_all_done, tensordict, auto_cast_to_device, max_steps, policy, policy_device, env_device, callback)
   2719     else:
   2720         tensordict.clear_device_()
-> 2722 tensordict = policy(tensordict)
   2723 if auto_cast_to_device:
   2724     if env_device is not None:

File D:\Study\venv\Lib\site-packages\torch\nn\modules\module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
   1734     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1735 else:
-> 1736     return self._call_impl(*args, **kwargs)

File D:\Study\venv\Lib\site-packages\torch\nn\modules\module.py:1747, in Module._call_impl(self, *args, **kwargs)
   1742 # If we don't have any hooks, we want to skip the rest of the logic in
   1743 # this function, and just call forward.
   1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1745         or _global_backward_pre_hooks or _global_backward_hooks
   1746         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747     return forward_call(*args, **kwargs)
   1749 result = None
   1750 called_always_called_hooks = set()

File D:\Study\venv\Lib\site-packages\tensordict\nn\common.py:314, in dispatch.__call__.<locals>.wrapper(*args, **kwargs)
    311     return out[0] if len(out) == 1 else out
    313 if _self is not None:
--> 314     return func(_self, tensordict, *args, **kwargs)
    315 return func(tensordict, *args, **kwargs)

File D:\Study\venv\Lib\site-packages\tensordict\nn\utils.py:359, in _set_skip_existing_None.__call__.<locals>.wrapper(_self, tensordict, *args, **kwargs)
    357 self.prev = _SKIP_EXISTING
    358 try:
--> 359     result = func(_self, tensordict, *args, **kwargs)
    360 finally:
    361     _SKIP_EXISTING = self.prev

File D:\Study\venv\Lib\site-packages\tensordict\nn\probabilistic.py:622, in ProbabilisticTensorDictSequential.forward(self, tensordict, tensordict_out, **kwargs)
    613 @dispatch(auto_batch_size=False)
    614 @_set_skip_existing_None()
    615 def forward(
   (...)
    619     **kwargs,
    620 ) -> TensorDictBase:
    621     tensordict_out = self.get_dist_params(tensordict, tensordict_out, **kwargs)
--> 622     return self.module[-1](tensordict_out, _requires_sample=self._requires_sample)

File D:\Study\venv\Lib\site-packages\torch\nn\modules\module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
   1734     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1735 else:
-> 1736     return self._call_impl(*args, **kwargs)

File D:\Study\venv\Lib\site-packages\torch\nn\modules\module.py:1747, in Module._call_impl(self, *args, **kwargs)
   1742 # If we don't have any hooks, we want to skip the rest of the logic in
   1743 # this function, and just call forward.
   1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1745         or _global_backward_pre_hooks or _global_backward_hooks
   1746         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747     return forward_call(*args, **kwargs)
   1749 result = None
   1750 called_always_called_hooks = set()

File D:\Study\venv\Lib\site-packages\tensordict\nn\common.py:314, in dispatch.__call__.<locals>.wrapper(*args, **kwargs)
    311     return out[0] if len(out) == 1 else out
    313 if _self is not None:
--> 314     return func(_self, tensordict, *args, **kwargs)
    315 return func(tensordict, *args, **kwargs)

File D:\Study\venv\Lib\site-packages\tensordict\nn\utils.py:359, in _set_skip_existing_None.__call__.<locals>.wrapper(_self, tensordict, *args, **kwargs)
    357 self.prev = _SKIP_EXISTING
    358 try:
--> 359     result = func(_self, tensordict, *args, **kwargs)
    360 finally:
    361     _SKIP_EXISTING = self.prev

File D:\Study\venv\Lib\site-packages\tensordict\nn\probabilistic.py:393, in ProbabilisticTensorDictModule.forward(self, tensordict, tensordict_out, _requires_sample)
    391 dist = self.get_dist(tensordict)
    392 if _requires_sample:
--> 393     out_tensors = self._dist_sample(dist, interaction_type=interaction_type())
    394     if isinstance(out_tensors, TensorDictBase):
    395         if self.return_log_prob:

File D:\Study\venv\Lib\site-packages\tensordict\nn\probabilistic.py:490, in ProbabilisticTensorDictModule._dist_sample(self, dist, interaction_type)
    485         raise NotImplementedError(
    486             f"method {type(dist)}.median is not implemented"
    487         )
    489 elif interaction_type is InteractionType.MEAN:
--> 490     if hasattr(dist, "mean"):
    491         try:
    492             return dist.mean

File D:\Study\venv\Lib\site-packages\torchrl\modules\distributions\continuous.py:551, in TanhNormal.mean(self)
    549 @property
    550 def mean(self):
--> 551     raise NotImplementedError(
    552         f"{type(self).__name__} does not have a closed form formula for the average. "
    553         "Am estimate of this value can be computed using dist.sample((N,)).mean(dim=0), "
    554         "where N is a large number of samples."
    555     )

NotImplementedError: TanhNormal does not have a closed form formula for the average. Am estimate of this value can be computed using dist.sample((N,)).mean(dim=0), where N is a large number of samples.
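For context, the failure happens because the rollout runs under `ExplorationType.MEAN`, which makes the policy query `TanhNormal.mean`, and this torchrl version raises instead of computing it. Below is a minimal, untested sketch of two possible workarounds (my own illustration, not part of the tutorial); it assumes `ExplorationType.DETERMINISTIC` is available in the installed torchrl release.

# Option 1 (sketch): avoid the closed-form mean entirely by running the
# evaluation rollout with the deterministic exploration type, if it exists
# in this torchrl version.
with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
    eval_rollout = env.rollout(1000, policy_module)

# Option 2 (sketch): estimate the mean by sampling, as the error message
# itself suggests. `dist` stands for a TanhNormal instance here and is
# only illustrative.
# approx_mean_action = dist.sample((1000,)).mean(dim=0)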

Suggest a potential alternative/fix

No response

cc @vmoens @nairbv
