📚 The doc issue
https://pytorch.org/tutorials/intermediate/reinforcement_ppo.html
My Environment
Windows 11 Pro
NVIDIA GeForce RTX 3060 Ti
Python 3.12.7
torch==2.5.1+cu124
torchrl==0.6.0
torchvision==0.20.1+cu124
gym==0.26.2
gymnasium[classic_control]==1.0.0
Since I have installed both gym and gymnasium, I set the backend explicitly when initializing the environment.
from torchrl.envs.libs.gym import GymEnv, set_gym_backend

with set_gym_backend("gym"):
    base_env = GymEnv("InvertedDoublePendulum-v4", device=device)
Everything runs fine until the last step. For example, the environment check passes:
check_env_specs(env)
2025-01-10 18:56:09,849 [torchrl][INFO] check_env_specs succeeded!
Here is the failing code (the training loop from the tutorial):
logs = defaultdict(list)
pbar = tqdm(total=total_frames)
eval_str = ""

# We iterate over the collector until it reaches the total number of frames it was
# designed to collect:
for i, tensordict_data in enumerate(collector):
    # we now have a batch of data to work with. Let's learn something from it.
    for _ in range(num_epochs):
        # We'll need an "advantage" signal to make PPO work.
        # We re-compute it at each epoch as its value depends on the value
        # network which is updated in the inner loop.
        advantage_module(tensordict_data)
        data_view = tensordict_data.reshape(-1)
        replay_buffer.extend(data_view.cpu())
        for _ in range(frames_per_batch // sub_batch_size):
            subdata = replay_buffer.sample(sub_batch_size)
            loss_vals = loss_module(subdata.to(device))
            loss_value = (
                loss_vals["loss_objective"]
                + loss_vals["loss_critic"]
                + loss_vals["loss_entropy"]
            )

            # Optimization: backward, grad clipping and optimization step
            loss_value.backward()
            # this is not strictly mandatory but it's good practice to keep
            # your gradient norm bounded
            torch.nn.utils.clip_grad_norm_(loss_module.parameters(), max_grad_norm)
            optim.step()
            optim.zero_grad()

    logs["reward"].append(tensordict_data["next", "reward"].mean().item())
    pbar.update(tensordict_data.numel())
    cum_reward_str = (
        f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})"
    )
    logs["step_count"].append(tensordict_data["step_count"].max().item())
    stepcount_str = f"step count (max): {logs['step_count'][-1]}"
    logs["lr"].append(optim.param_groups[0]["lr"])
    lr_str = f"lr policy: {logs['lr'][-1]: 4.4f}"
    if i % 10 == 0:
        # We evaluate the policy once every 10 batches of data.
        # Evaluation is rather simple: execute the policy without exploration
        # (take the expected value of the action distribution) for a given
        # number of steps (1000, which is our ``env`` horizon).
        # The ``rollout`` method of the ``env`` can take a policy as argument:
        # it will then execute this policy at each step.
        with set_exploration_type(ExplorationType.MEAN), torch.no_grad():
            # execute a rollout with the trained policy
            eval_rollout = env.rollout(1000, policy_module)
            logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
            logs["eval reward (sum)"].append(
                eval_rollout["next", "reward"].sum().item()
            )
            logs["eval step_count"].append(eval_rollout["step_count"].max().item())
            eval_str = (
                f"eval cumulative reward: {logs['eval reward (sum)'][-1]: 4.4f} "
                f"(init: {logs['eval reward (sum)'][0]: 4.4f}), "
                f"eval step-count: {logs['eval step_count'][-1]}"
            )
            del eval_rollout
    pbar.set_description(", ".join([eval_str, cum_reward_str, stepcount_str, lr_str]))

    # We're also using a learning rate scheduler. Like the gradient clipping,
    # this is a nice-to-have but nothing necessary for PPO to work.
    scheduler.step()
Here are the error details:
2%|██▌ | 1000/50000 [00:04<03:40, 221.83it/s]
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
Cell In[17], line 51
42 if i % 10 == 0:
43 # We evaluate the policy once every 10 batches of data.
44 # Evaluation is rather simple: execute the policy without exploration
(...)
47 # The ``rollout`` method of the ``env`` can take a policy as argument:
48 # it will then execute this policy at each step.
49 with set_exploration_type(ExplorationType.MEAN), torch.no_grad():
50 # execute a rollout with the trained policy
---> 51 eval_rollout = env.rollout(1000, policy_module)
52 logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
53 logs["eval reward (sum)"].append(
54 eval_rollout["next", "reward"].sum().item()
55 )
File D:\Study\venv\Lib\site-packages\torchrl\envs\common.py:2635, in EnvBase.rollout(self, max_steps, policy, callback, auto_reset, auto_cast_to_device, break_when_any_done, break_when_all_done, return_contiguous, tensordict, set_truncated, out, trust_policy)
2625 kwargs = {
2626 "tensordict": tensordict,
2627 "auto_cast_to_device": auto_cast_to_device,
(...)
2632 "callback": callback,
2633 }
2634 if break_when_any_done or break_when_all_done:
-> 2635 tensordicts = self._rollout_stop_early(
2636 break_when_all_done=break_when_all_done,
2637 break_when_any_done=break_when_any_done,
2638 **kwargs,
2639 )
2640 else:
2641 tensordicts = self._rollout_nonstop(**kwargs)
File D:\Study\venv\Lib\site-packages\torchrl\envs\common.py:2722, in EnvBase._rollout_stop_early(self, break_when_any_done, break_when_all_done, tensordict, auto_cast_to_device, max_steps, policy, policy_device, env_device, callback)
2719 else:
2720 tensordict.clear_device_()
-> 2722 tensordict = policy(tensordict)
2723 if auto_cast_to_device:
2724 if env_device is not None:
File D:\Study\venv\Lib\site-packages\torch\nn\modules\module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
File D:\Study\venv\Lib\site-packages\torch\nn\modules\module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()
File D:\Study\venv\Lib\site-packages\tensordict\nn\common.py:314, in dispatch.__call__.<locals>.wrapper(*args, **kwargs)
311 return out[0] if len(out) == 1 else out
313 if _self is not None:
--> 314 return func(_self, tensordict, *args, **kwargs)
315 return func(tensordict, *args, **kwargs)
File D:\Study\venv\Lib\site-packages\tensordict\nn\utils.py:359, in _set_skip_existing_None.__call__.<locals>.wrapper(_self, tensordict, *args, **kwargs)
357 self.prev = _SKIP_EXISTING
358 try:
--> 359 result = func(_self, tensordict, *args, **kwargs)
360 finally:
361 _SKIP_EXISTING = self.prev
File D:\Study\venv\Lib\site-packages\tensordict\nn\probabilistic.py:622, in ProbabilisticTensorDictSequential.forward(self, tensordict, tensordict_out, **kwargs)
613 @dispatch(auto_batch_size=False)
614 @_set_skip_existing_None()
615 def forward(
(...)
619 **kwargs,
620 ) -> TensorDictBase:
621 tensordict_out = self.get_dist_params(tensordict, tensordict_out, **kwargs)
--> 622 return self.module[-1](tensordict_out, _requires_sample=self._requires_sample)
File D:\Study\venv\Lib\site-packages\torch\nn\modules\module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
File D:\Study\venv\Lib\site-packages\torch\nn\modules\module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()
File D:\Study\venv\Lib\site-packages\tensordict\nn\common.py:314, in dispatch.__call__.<locals>.wrapper(*args, **kwargs)
311 return out[0] if len(out) == 1 else out
313 if _self is not None:
--> 314 return func(_self, tensordict, *args, **kwargs)
315 return func(tensordict, *args, **kwargs)
File D:\Study\venv\Lib\site-packages\tensordict\nn\utils.py:359, in _set_skip_existing_None.__call__.<locals>.wrapper(_self, tensordict, *args, **kwargs)
357 self.prev = _SKIP_EXISTING
358 try:
--> 359 result = func(_self, tensordict, *args, **kwargs)
360 finally:
361 _SKIP_EXISTING = self.prev
File D:\Study\venv\Lib\site-packages\tensordict\nn\probabilistic.py:393, in ProbabilisticTensorDictModule.forward(self, tensordict, tensordict_out, _requires_sample)
391 dist = self.get_dist(tensordict)
392 if _requires_sample:
--> 393 out_tensors = self._dist_sample(dist, interaction_type=interaction_type())
394 if isinstance(out_tensors, TensorDictBase):
395 if self.return_log_prob:
File D:\Study\venv\Lib\site-packages\tensordict\nn\probabilistic.py:490, in ProbabilisticTensorDictModule._dist_sample(self, dist, interaction_type)
485 raise NotImplementedError(
486 f"method {type(dist)}.median is not implemented"
487 )
489 elif interaction_type is InteractionType.MEAN:
--> 490 if hasattr(dist, "mean"):
491 try:
492 return dist.mean
File D:\Study\venv\Lib\site-packages\torchrl\modules\distributions\continuous.py:551, in TanhNormal.mean(self)
549 @property
550 def mean(self):
--> 551 raise NotImplementedError(
552 f"{type(self).__name__} does not have a closed form formula for the average. "
553 "Am estimate of this value can be computed using dist.sample((N,)).mean(dim=0), "
554 "where N is a large number of samples."
555 )
NotImplementedError: TanhNormal does not have a closed form formula for the average. Am estimate of this value can be computed using dist.sample((N,)).mean(dim=0), where N is a large number of samples.
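For reference, the failure does not depend on the rest of the training loop: under ExplorationType.MEAN the ProbabilisticTensorDictModule queries dist.mean, and torchrl 0.6.0's TanhNormal raises on that property. Below is a minimal sketch that reproduces the error and applies the sampling-based estimate the message itself suggests (the loc/scale values are arbitrary placeholders, not taken from the tutorial).

import torch
from torchrl.modules import TanhNormal

# Arbitrary placeholder parameters, only to build a distribution.
dist = TanhNormal(loc=torch.zeros(3), scale=torch.ones(3))

# dist.mean  # raises the NotImplementedError shown above in torchrl 0.6.0

# Monte-Carlo estimate of the mean, as suggested by the error message:
mean_estimate = dist.sample((10_000,)).mean(dim=0)
print(mean_estimate)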
Suggest a potential alternative/fix
No response
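A possible direction, offered only as an untested sketch: avoid querying the analytical mean during evaluation, for example by switching the exploration type from MEAN to DETERMINISTIC (assuming the installed torchrl exposes ExplorationType.DETERMINISTIC, which falls back to a deterministic sample rather than dist.mean). Only the evaluation context manager in the training loop above would change; env and policy_module are the tutorial's objects.

import torch
from torchrl.envs.utils import ExplorationType, set_exploration_type

# Hypothetical replacement for the evaluation block above:
# DETERMINISTIC does not require TanhNormal to expose an analytical mean.
with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
    eval_rollout = env.rollout(1000, policy_module)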