diff --git a/advanced_source/coding_ddpg.py b/advanced_source/coding_ddpg.py
index 0aefc716931..7dd3acf238d 100644
--- a/advanced_source/coding_ddpg.py
+++ b/advanced_source/coding_ddpg.py
@@ -65,26 +65,33 @@
 # sphinx_gallery_start_ignore
 import warnings
+
 warnings.filterwarnings("ignore")
-import multiprocessing
+from torch import multiprocessing
+
 # TorchRL prefers spawn method, that restricts creation of ``~torchrl.envs.ParallelEnv`` inside
 # `__main__` method call, but for the easy of reading the code switch to fork
 # which is also a default spawn method in Google's Colaboratory
 try:
     multiprocessing.set_start_method("fork")
 except RuntimeError:
-    assert multiprocessing.get_start_method() == "fork"
+    pass
+
 # sphinx_gallery_end_ignore

-import torchrl
 import torch
 import tqdm
-from typing import Tuple
+

 ###############################################################################
 # We will execute the policy on CUDA if available
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+is_fork = multiprocessing.get_start_method() == "fork"
+device = (
+    torch.device(0)
+    if torch.cuda.is_available() and not is_fork
+    else torch.device("cpu")
+)
 collector_device = torch.device("cpu")  # Change the device to ``cuda`` to use CUDA

 ###############################################################################
@@ -244,23 +251,18 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams):
     hp.update(hyperparams)
     value_key = "state_action_value"
     if value_type == ValueEstimators.TD1:
-        self._value_estimator = TD1Estimator(
-            value_network=self.actor_critic, value_key=value_key, **hp
-        )
+        self._value_estimator = TD1Estimator(value_network=self.actor_critic, **hp)
     elif value_type == ValueEstimators.TD0:
-        self._value_estimator = TD0Estimator(
-            value_network=self.actor_critic, value_key=value_key, **hp
-        )
+        self._value_estimator = TD0Estimator(value_network=self.actor_critic, **hp)
     elif value_type == ValueEstimators.GAE:
         raise NotImplementedError(
             f"Value type {value_type} it not implemented for loss {type(self)}."
         )
     elif value_type == ValueEstimators.TDLambda:
-        self._value_estimator = TDLambdaEstimator(
-            value_network=self.actor_critic, value_key=value_key, **hp
-        )
+        self._value_estimator = TDLambdaEstimator(value_network=self.actor_critic, **hp)
     else:
         raise NotImplementedError(f"Unknown value type {value_type}")
+    self._value_estimator.set_keys(value=value_key)


 ###############################################################################
@@ -311,7 +313,7 @@ def _loss_actor(
 def _loss_value(
     self,
     tensordict,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+):
     td_copy = tensordict.clone()

     # V(s, a)
@@ -349,7 +351,7 @@ def _loss_value(
 # value and actor loss, collect the cost values and write them in a ``TensorDict``
 # delivered to the user.

-from tensordict.tensordict import TensorDict, TensorDictBase
+from tensordict import TensorDict, TensorDictBase


 def _forward(self, input_tensordict: TensorDictBase) -> TensorDict:
@@ -457,6 +459,7 @@ def make_env(from_pixels=False):
         raise NotImplementedError

     env_kwargs = {
+        "device": device,
         "from_pixels": from_pixels,
         "pixels_only": from_pixels,
         "frame_skip": 2,
@@ -519,16 +522,6 @@ def make_transformed_env(
     # syntax.
     env.append_transform(RewardScaling(loc=0.0, scale=reward_scaling))

-    double_to_float_list = []
-    double_to_float_inv_list = []
-    if env_library is DMControlEnv:
-        # ``DMControl`` requires double-precision
-        double_to_float_list += [
-            "reward",
-            "action",
-        ]
-        double_to_float_inv_list += ["action"]
-
     # We concatenate all states into a single "observation_vector"
     # even if there is a single tensor, it'll be renamed in "observation_vector".
     # This facilitates the downstream operations as we know the name of the
@@ -544,12 +537,7 @@ make_transformed_env(
     # version of the transform
     env.append_transform(ObservationNorm(in_keys=[out_key], standard_normal=True))

-    double_to_float_list.append(out_key)
-    env.append_transform(
-        DoubleToFloat(
-            in_keys=double_to_float_list, in_keys_inv=double_to_float_inv_list
-        )
-    )
+    env.append_transform(DoubleToFloat())

     env.append_transform(StepCounter(max_frames_per_traj))

@@ -874,9 +862,6 @@ def make_ddpg_actor(
     reset_at_each_iter=False,
     split_trajs=False,
     device=collector_device,
-    # device for execution
-    storing_device=collector_device,
-    # device where data will be stored and passed
     exploration_type=ExplorationType.RANDOM,
 )

diff --git a/advanced_source/pendulum.py b/advanced_source/pendulum.py
index 9f7a7f75626..38524cfff40 100644
--- a/advanced_source/pendulum.py
+++ b/advanced_source/pendulum.py
@@ -10,7 +10,7 @@
 is an integrative part of reinforcement learning and control engineering.
 TorchRL provides a set of tools to do this in multiple contexts.

-This tutorial demonstrates how to use PyTorch and TorchRL code a pendulum
+This tutorial demonstrates how to use PyTorch and TorchRL to code a pendulum
 simulator from the ground up.
 It is freely inspired by the Pendulum-v1 implementation from `OpenAI-Gym/Farama-Gymnasium
 control library `__.
@@ -49,9 +49,9 @@
 # cover a broader range of features of the environment API in TorchRL.
 #
 # Modeling stateless environments gives users full control over the input and
-# outputs of the simulator: one can reset an experiment at any stage or actively
-# modify the dynamics from the outside. However, it assumes that we have some control
-# over a task, which may not always be the case: solving a problem where we cannot
+# outputs of the simulator: one can reset an experiment at any stage or actively
+# modify the dynamics from the outside. However, it assumes that we have some control
+# over a task, which may not always be the case: solving a problem where we cannot
 # control the current state is more challenging but has a much wider set of applications.
 #
 # Another advantage of stateless environments is that they can enable
@@ -73,14 +73,31 @@
 # simulation graph.
 # * Finally, we will train a simple policy to solve the system we implemented.
 #
+
+# sphinx_gallery_start_ignore
+import warnings
+
+warnings.filterwarnings("ignore")
+from torch import multiprocessing
+
+# TorchRL prefers the spawn method, which restricts creation of ``~torchrl.envs.ParallelEnv`` inside
+# the `__main__` method call, but for ease of reading this code switches to fork,
+# which is also the default start method in Google's Colaboratory
+try:
+    multiprocessing.set_start_method("fork")
+except RuntimeError:
+    pass
+
+# sphinx_gallery_end_ignore
+
 from collections import defaultdict
 from typing import Optional

 import numpy as np
 import torch
 import tqdm
+from tensordict import TensorDict, TensorDictBase
 from tensordict.nn import TensorDictModule
-from tensordict.tensordict import TensorDict, TensorDictBase
 from torch import nn

 from torchrl.data import BoundedTensorSpec, CompositeSpec, UnboundedContinuousTensorSpec
@@ -167,7 +184,7 @@
 # of :meth:`~torchrl.envs.EnvBase.step` in the input ``tensordict`` to enforce
 # input/output consistency.
 #
-# Typically, for stateful environments, this will look like this:
+# Typically, for stateful environments, this will look like this:
 #
 # .. code-block::
 #
@@ -221,6 +238,7 @@
 # needed as the state needs to be read from the environment.
 #

+
 def _step(tensordict):
     th, thdot = tensordict["th"], tensordict["thdot"]  # th := theta
@@ -896,7 +914,7 @@ def plot():
 ######################################################################
 # Conclusion
 # ----------
-#
+#
 # In this tutorial, we have learned how to code a stateless environment from
 # scratch. We touched the subjects of:
 #
diff --git a/intermediate_source/dqn_with_rnn_tutorial.py b/intermediate_source/dqn_with_rnn_tutorial.py
index 55afbbe5e45..8135f07cd3f 100644
--- a/intermediate_source/dqn_with_rnn_tutorial.py
+++ b/intermediate_source/dqn_with_rnn_tutorial.py
@@ -68,6 +68,22 @@
 # -----
 #

+# sphinx_gallery_start_ignore
+import warnings
+
+warnings.filterwarnings("ignore")
+from torch import multiprocessing
+
+# TorchRL prefers the spawn method, which restricts creation of ``~torchrl.envs.ParallelEnv`` inside
+# the `__main__` method call, but for ease of reading this code switches to fork,
+# which is also the default start method in Google's Colaboratory
+try:
+    multiprocessing.set_start_method("fork")
+except RuntimeError:
+    pass
+
+# sphinx_gallery_end_ignore
+
 import torch
 import tqdm
 from tensordict.nn import TensorDictModule as Mod, TensorDictSequential as Seq
@@ -88,10 +104,15 @@
     TransformedEnv,
 )
 from torchrl.envs.libs.gym import GymEnv
-from torchrl.modules import ConvNet, EGreedyWrapper, LSTMModule, MLP, QValueModule
+from torchrl.modules import ConvNet, EGreedyModule, LSTMModule, MLP, QValueModule
 from torchrl.objectives import DQNLoss, SoftUpdate

-device = torch.device(0) if torch.cuda.device_count() else torch.device("cpu")
+is_fork = multiprocessing.get_start_method() == "fork"
+device = (
+    torch.device(0)
+    if torch.cuda.is_available() and not is_fork
+    else torch.device("cpu")
+)

 ######################################################################
 # Environment
@@ -293,11 +314,15 @@
 # DQN being a deterministic algorithm, exploration is a crucial part of it.
 # We'll be using an :math:`\epsilon`-greedy policy with an epsilon of 0.2 decaying
 # progressively to 0.
-# This decay is achieved via a call to :meth:`~torchrl.modules.EGreedyWrapper.step`
+# This decay is achieved via a call to :meth:`~torchrl.modules.EGreedyModule.step`
 # (see training loop below).
 #
-stoch_policy = EGreedyWrapper(
-    stoch_policy, annealing_num_steps=1_000_000, spec=env.action_spec, eps_init=0.2
+exploration_module = EGreedyModule(
+    annealing_num_steps=1_000_000, spec=env.action_spec, eps_init=0.2
+)
+stoch_policy = Seq(
+    stoch_policy,
+    exploration_module,
 )

 ######################################################################
@@ -362,7 +387,7 @@
 # For the sake of efficiency, we're only running a few thousands iterations
 # here. In a real setting, the total number of frames should be set to 1M.
 #
-collector = SyncDataCollector(env, stoch_policy, frames_per_batch=50, total_frames=200)
+collector = SyncDataCollector(env, stoch_policy, frames_per_batch=50, total_frames=200, device=device)
 rb = TensorDictReplayBuffer(
     storage=LazyMemmapStorage(20_000), batch_size=4, prefetch=10
 )
@@ -403,7 +428,7 @@
     pbar.set_description(
         f"steps: {longest}, loss_val: {loss_vals['loss'].item(): 4.4f}, action_spread: {data['action'].sum(0)}"
     )
-    stoch_policy.step(data.numel())
+    exploration_module.step(data.numel())
     updater.step()

     with set_exploration_type(ExplorationType.MODE), torch.no_grad():
diff --git a/intermediate_source/mario_rl_tutorial.py b/intermediate_source/mario_rl_tutorial.py
index 8fe5b327d02..03d6396a47e 100755
--- a/intermediate_source/mario_rl_tutorial.py
+++ b/intermediate_source/mario_rl_tutorial.py
@@ -32,8 +32,8 @@
 #
 # %%bash
 # pip install gym-super-mario-bros==7.4.0
-# pip install tensordict==0.2.0
-# pip install torchrl==0.2.0
+# pip install tensordict==0.3.0
+# pip install torchrl==0.3.0
 #

 import torch
diff --git a/intermediate_source/reinforcement_ppo.py b/intermediate_source/reinforcement_ppo.py
index 1b75e21e51a..9f4bf155618 100644
--- a/intermediate_source/reinforcement_ppo.py
+++ b/intermediate_source/reinforcement_ppo.py
@@ -104,6 +104,22 @@
 # description and more about the algorithm itself.
 #

+# sphinx_gallery_start_ignore
+import warnings
+
+warnings.filterwarnings("ignore")
+from torch import multiprocessing
+
+# TorchRL prefers the spawn method, which restricts creation of ``~torchrl.envs.ParallelEnv`` inside
+# the `__main__` method call, but for ease of reading this code switches to fork,
+# which is also the default start method in Google's Colaboratory
+try:
+    multiprocessing.set_start_method("fork")
+except RuntimeError:
+    pass
+
+# sphinx_gallery_end_ignore
+
 from collections import defaultdict

 import matplotlib.pyplot as plt
@@ -118,7 +134,7 @@
 from torchrl.envs import (Compose, DoubleToFloat, ObservationNorm, StepCounter,
                           TransformedEnv)
 from torchrl.envs.libs.gym import GymEnv
-from torchrl.envs.utils import check_env_specs, set_exploration_mode
+from torchrl.envs.utils import check_env_specs, ExplorationType, set_exploration_type
 from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
 from torchrl.objectives import ClipPPOLoss
 from torchrl.objectives.value import GAE
@@ -137,7 +153,12 @@
 # actually return ``frame_skip`` frames).
 #

-device = "cpu" if not torch.cuda.is_available() else "cuda:0"
+is_fork = multiprocessing.get_start_method() == "fork"
+device = (
+    torch.device(0)
+    if torch.cuda.is_available() and not is_fork
+    else torch.device("cpu")
+)
 num_cells = 256  # number of cells in each layer i.e. output dim.
 lr = 3e-4
 max_grad_norm = 1.0
@@ -152,22 +173,10 @@
 # use. In general, the goal of an RL algorithm is to learn to solve the task
 # as fast as it can in terms of environment interactions: the lower the ``total_frames``
 # the better.
-# We also define a ``frame_skip``: in some contexts, repeating the same action
-# multiple times over the course of a trajectory may be beneficial as it makes
-# the behavior more consistent and less erratic. However, "skipping"
-# too many frames will hamper training by reducing the reactivity of the actor
-# to observation changes.
-#
-# When using ``frame_skip`` it is good practice to
-# correct the other frame counts by the number of frames we are grouping
-# together. If we configure a total count of X frames for training but
-# use a ``frame_skip`` of Y, we will be actually collecting ``XY`` frames in total
-# which exceeds our predefined budget.
-#
-frame_skip = 1
-frames_per_batch = 1000 // frame_skip
+#
+frames_per_batch = 1000
 # For a complete training, bring the number of frames up to 1M
-total_frames = 50_000 // frame_skip
+total_frames = 50_000

 ######################################################################
 # PPO parameters
@@ -196,14 +205,14 @@
 #
 # In RL, an *environment* is usually the way we refer to a simulator or a
 # control system. Various libraries provide simulation environments for reinforcement
-# learning, including Gymnasium (previously OpenAI Gym), DeepMind Control Suite, and
+# learning, including Gymnasium (previously OpenAI Gym), DeepMind control suite, and
 # many others.
 # As a general library, TorchRL's goal is to provide an interchangeable interface
 # to a large panel of RL simulators, allowing you to easily swap one environment
 # with another. For example, creating a wrapped gym environment can be achieved with few characters:
 #

-base_env = GymEnv("InvertedDoublePendulum-v4", device=device, frame_skip=frame_skip)
+base_env = GymEnv("InvertedDoublePendulum-v4", device=device)

 ######################################################################
 # There are a few things to notice in this code: first, we created
@@ -262,7 +271,7 @@
     Compose(
         # normalize observations
         ObservationNorm(in_keys=["observation"]),
-        DoubleToFloat(in_keys=["observation"]),
+        DoubleToFloat(),
         StepCounter(),
     ),
 )
@@ -410,8 +419,8 @@
     in_keys=["loc", "scale"],
     distribution_class=TanhNormal,
     distribution_kwargs={
-        "min": env.action_spec.space.minimum,
-        "max": env.action_spec.space.maximum,
+        "min": env.action_spec.space.low,
+        "max": env.action_spec.space.high,
     },
     return_log_prob=True,
     # we'll need the log-prob for the numerator of the importance weights
@@ -514,7 +523,7 @@
 #

 replay_buffer = ReplayBuffer(
-    storage=LazyTensorStorage(frames_per_batch),
+    storage=LazyTensorStorage(max_size=frames_per_batch),
     sampler=SamplerWithoutReplacement(),
 )
@@ -546,16 +555,13 @@
 )

 loss_module = ClipPPOLoss(
-    actor=policy_module,
-    critic=value_module,
-    advantage_key="advantage",
+    actor_network=policy_module,
+    critic_network=value_module,
     clip_epsilon=clip_epsilon,
     entropy_bonus=bool(entropy_eps),
     entropy_coef=entropy_eps,
     # these keys match by default but we set this for completeness
-    value_target_key=advantage_module.value_target_key,
     critic_coef=1.0,
-    gamma=0.99,
     loss_critic_type="smooth_l1",
 )
@@ -586,7 +592,7 @@
 logs = defaultdict(list)
-pbar = tqdm(total=total_frames * frame_skip)
+pbar = tqdm(total=total_frames)
 eval_str = ""

 # We iterate over the collector until it reaches the total number of frames it was
@@ -618,7 +624,7 @@
             optim.zero_grad()

     logs["reward"].append(tensordict_data["next", "reward"].mean().item())
-    pbar.update(tensordict_data.numel() * frame_skip)
+    pbar.update(tensordict_data.numel())
     cum_reward_str = (
         f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})"
     )
@@ -633,7 +639,7 @@
         # number of steps (1000, which is our ``env`` horizon).
         # The ``rollout`` method of the ``env`` can take a policy as argument:
         # it will then execute this policy at each step.
-        with set_exploration_mode("mean"), torch.no_grad():
+        with set_exploration_type(ExplorationType.MEAN), torch.no_grad():
             # execute a rollout with the trained policy
             eval_rollout = env.rollout(1000, policy_module)
             logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
diff --git a/requirements.txt b/requirements.txt
index 649af9b9b3f..918cb86d730 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,8 +25,8 @@ tensorboard
 jinja2==3.1.3
 pytorch-lightning
 torchx
-torchrl==0.2.1
-tensordict==0.2.1
+torchrl==0.3.0
+tensordict==0.3.0
 ax-platform
 nbformat>=4.2.0
 datasets
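
The fork-aware device selection that now appears in each of these tutorials can be exercised on its own. The sketch below is taken from the pattern added above and only assumes ``torch`` is installed; it falls back to CPU when the ``fork`` start method is active, since CUDA cannot be re-initialized in forked subprocesses.

import torch
from torch import multiprocessing

# "fork"ed workers cannot re-initialize CUDA, so only pick the GPU when the
# start method is not "fork".
is_fork = multiprocessing.get_start_method() == "fork"
device = (
    torch.device(0)
    if torch.cuda.is_available() and not is_fork
    else torch.device("cpu")
)
print(f"running on {device}")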
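The swap from ``EGreedyWrapper`` to ``EGreedyModule`` in the DQN tutorial composes exploration as an extra module appended to the greedy policy rather than a wrapper around it. Below is a minimal, self-contained sketch of that composition under torchrl/tensordict 0.3; the ``CartPole-v1`` environment and the small MLP value network are illustrative stand-ins, not code from the patch.

from tensordict.nn import TensorDictModule as Mod, TensorDictSequential as Seq
from torchrl.envs.libs.gym import GymEnv
from torchrl.modules import EGreedyModule, MLP, QValueModule

env = GymEnv("CartPole-v1")
n_obs = env.observation_spec["observation"].shape[-1]
n_act = env.action_spec.shape[-1]

# Greedy policy: an MLP producing action values, followed by an argmax.
value_net = Mod(
    MLP(in_features=n_obs, out_features=n_act, num_cells=[64]),
    in_keys=["observation"],
    out_keys=["action_value"],
)
policy = Seq(value_net, QValueModule(spec=env.action_spec))

# Exploration is appended to the policy instead of wrapping it.
exploration_module = EGreedyModule(
    annealing_num_steps=1_000_000, spec=env.action_spec, eps_init=0.2
)
stoch_policy = Seq(policy, exploration_module)

rollout = env.rollout(3, stoch_policy)
# Epsilon is annealed by stepping the exploration module, typically once per
# collected batch, as in the training loop above.
exploration_module.step(rollout.numel())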
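Likewise, the move from ``set_exploration_mode("mean")`` to ``set_exploration_type(ExplorationType.MEAN)`` in the PPO tutorial uses the enum-based exploration API. The following sketch shows the idiom in isolation; the ``Pendulum-v1`` environment and the throwaway ``TanhNormal`` actor are assumptions for illustration, not part of the patch.

import torch
from torch import nn
from tensordict.nn import TensorDictModule
from tensordict.nn.distributions import NormalParamExtractor
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs.utils import ExplorationType, set_exploration_type
from torchrl.modules import ProbabilisticActor, TanhNormal

env = GymEnv("Pendulum-v1")
n_act = env.action_spec.shape[-1]

# A tiny stochastic actor emitting the parameters of a TanhNormal distribution.
net = nn.Sequential(nn.LazyLinear(2 * n_act), NormalParamExtractor())
policy = ProbabilisticActor(
    TensorDictModule(net, in_keys=["observation"], out_keys=["loc", "scale"]),
    in_keys=["loc", "scale"],
    distribution_class=TanhNormal,
    distribution_kwargs={
        "min": env.action_spec.space.low,
        "max": env.action_spec.space.high,
    },
)

# MEAN takes the distribution mean instead of sampling, which is what the
# evaluation rollout requests through the enum-based context manager.
with set_exploration_type(ExplorationType.MEAN), torch.no_grad():
    eval_rollout = env.rollout(100, policy)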