diff --git a/docs/source/index.rst b/docs/source/index.rst index 0e034ef2351..2eedc045416 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -22,15 +22,15 @@ TorchRL provides pytorch and python-first, low and high level abstractions for R The code is aimed at supporting research in RL. Most of it is written in python in a highly modular way, such that researchers can easily swap components, transform them or write new ones with little effort. This repo attempts to align with the existing pytorch ecosystem libraries in that it has a "dataset pillar" -:doc:`(environments) `, -:ref:`transforms `, -:doc:`models `, +:ref:`(environments) `, +:ref:`transforms `, +:ref:`models `, data utilities (e.g. collectors and containers), etc. TorchRL aims at having as few dependencies as possible (python standard library, numpy and pytorch). Common environment libraries (e.g. OpenAI gym) are only optional. On the low-level end, torchrl comes with a set of highly re-usable functionals -for :doc:`cost functions `, :ref:`returns ` and data processing. +for :ref:`cost functions `, :ref:`returns ` and data processing. TorchRL aims at a high modularity and good runtime performance. diff --git a/docs/source/reference/data.rst b/docs/source/reference/data.rst index c95edd6156f..852592992b9 100644 --- a/docs/source/reference/data.rst +++ b/docs/source/reference/data.rst @@ -944,7 +944,7 @@ not predictable. MultiCategorical MultiOneHot NonTensor - OneHotDiscrete + OneHot Stacked StackedComposite Unbounded @@ -1050,7 +1050,7 @@ and the tree can be expanded for each of these. The following figure shows how t BinaryToDecimal HashToInt - MCTSForeset + MCTSForest QueryModule RandomProjectionHash SipHash diff --git a/docs/source/reference/objectives.rst b/docs/source/reference/objectives.rst index 76066e35c4e..9e7df1bff8f 100644 --- a/docs/source/reference/objectives.rst +++ b/docs/source/reference/objectives.rst @@ -274,6 +274,9 @@ QMixer Returns ------- + +.. _ref_returns: + .. 
currentmodule:: torchrl.objectives.value .. autosummary:: diff --git a/sota-implementations/a2c/README.md b/sota-implementations/a2c/README.md index 513e6d70811..b3c651b7d4e 100644 --- a/sota-implementations/a2c/README.md +++ b/sota-implementations/a2c/README.md @@ -19,11 +19,21 @@ Please note that each example is independent of each other for the sake of simpl You can execute the A2C algorithm on Atari environments by running the following command: ```bash -python a2c_atari.py +python a2c_atari.py compile.compile=1 compile.cudagraphs=1 ``` + You can execute the A2C algorithm on MuJoCo environments by running the following command: ```bash -python a2c_mujoco.py +python a2c_mujoco.py compile.compile=1 compile.cudagraphs=1 ``` + +## Runtimes + +Runtimes when executed on H100: + +| Environment | Eager | Compile | Compile+cudagraphs | +|-------------|-------|---------|--------------------| +| MUJOCO | | | | +| ATARI | | 60 mins | 43 mins | diff --git a/sota-implementations/decision_transformer/lamb.py b/sota-implementations/decision_transformer/lamb.py index 7f874b6e049..69468d1ad86 100644 --- a/sota-implementations/decision_transformer/lamb.py +++ b/sota-implementations/decision_transformer/lamb.py @@ -15,15 +15,15 @@ class Lamb(Optimizer): LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. Arguments: params (iterable): iterable of parameters to optimize or dicts defining parameter groups. - lr (float, optional): learning rate. (default: 1e-3) + lr (:obj:`float`, optional): learning rate. (default: 1e-3) betas (Tuple[float, float], optional): coefficients used for computing running averages of gradient and its norm. (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve + eps (:obj:`float`, optional): term added to the denominator to improve numerical stability. 
(default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + weight_decay (:obj:`float`, optional): weight decay (L2 penalty) (default: 0) grad_averaging (bool, optional): whether apply (1-beta2) to grad when calculating running averages of gradient. (default: True) - max_grad_norm (float, optional): value used to clip global grad norm (default: 1.0) + max_grad_norm (:obj:`float`, optional): value used to clip global grad norm (default: 1.0) trust_clip (bool): enable LAMBC trust ratio clipping (default: False) always_adapt (boolean, optional): Apply adaptive learning rate to 0.0 weight decay parameter (default: False) diff --git a/test/test_collector.py b/test/test_collector.py index 1309254ce2d..7c185830a92 100644 --- a/test/test_collector.py +++ b/test/test_collector.py @@ -3172,6 +3172,28 @@ def make_and_test_policy( ) +@pytest.mark.parametrize( + "ctype", [SyncDataCollector, MultiaSyncDataCollector, MultiSyncDataCollector] +) +def test_no_stopiteration(ctype): + # Tests that there is no StopIteration raised and that the length of the collector is properly set + if ctype is SyncDataCollector: + envs = SerialEnv(16, CountingEnv) + else: + envs = [SerialEnv(8, CountingEnv), SerialEnv(8, CountingEnv)] + + collector = ctype(create_env_fn=envs, frames_per_batch=173, total_frames=300) + try: + c_iter = iter(collector) + for i in range(len(collector)): # noqa: B007 + c = next(c_iter) + assert c is not None + assert i == 1 + finally: + collector.shutdown() + del collector + + if __name__ == "__main__": args, unknown = argparse.ArgumentParser().parse_known_args() pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown) diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py index f40b6ebc20e..a58243ee673 100644 --- a/torchrl/collectors/collectors.py +++ b/torchrl/collectors/collectors.py @@ -147,6 +147,7 @@ class DataCollectorBase(IterableDataset, metaclass=abc.ABCMeta): _iterator = None total_frames: int 
frames_per_batch: int + requested_frames_per_batch: int trust_policy: bool compiled_policy: bool cudagraphed_policy: bool @@ -305,7 +306,7 @@ def __class_getitem__(self, index): def __len__(self) -> int: if self.total_frames > 0: - return -(self.total_frames // -self.frames_per_batch) + return -(self.total_frames // -self.requested_frames_per_batch) raise RuntimeError("Non-terminating collectors do not have a length") @@ -700,7 +701,7 @@ def __init__( remainder = total_frames % frames_per_batch if remainder != 0 and RL_WARNINGS: warnings.warn( - f"total_frames ({total_frames}) is not exactly divisible by frames_per_batch ({frames_per_batch})." + f"total_frames ({total_frames}) is not exactly divisible by frames_per_batch ({frames_per_batch}). " f"This means {frames_per_batch - remainder} additional frames will be collected." "To silence this message, set the environment variable RL_WARNINGS to False." ) @@ -1399,11 +1400,13 @@ class _MultiDataCollector(DataCollectorBase): instances) it will be wrapped in a `nn.Module` first. Then, the collector will try to assess if these modules require wrapping in a :class:`~tensordict.nn.TensorDictModule` or not. + - If the policy forward signature matches any of ``forward(self, tensordict)``, ``forward(self, td)`` or ``forward(self, : TensorDictBase)`` (or any typing with a single argument typed as a subclass of ``TensorDictBase``) then the policy won't be wrapped in a :class:`~tensordict.nn.TensorDictModule`. - - In all other cases an attempt to wrap it will be undergone as such: ``TensorDictModule(policy, in_keys=env_obs_key, out_keys=env.action_keys)``. + - In all other cases an attempt to wrap it will be undergone as such: + ``TensorDictModule(policy, in_keys=env_obs_key, out_keys=env.action_keys)``. 
Keyword Args: frames_per_batch (int): A keyword-only argument representing the @@ -1491,7 +1494,7 @@ class _MultiDataCollector(DataCollectorBase): update_at_each_batch (boolm optional): if ``True``, :meth:`~.update_policy_weight_()` will be called before (sync) or after (async) each data collection. Defaults to ``False``. - preemptive_threshold (float, optional): a value between 0.0 and 1.0 that specifies the ratio of workers + preemptive_threshold (:obj:`float`, optional): a value between 0.0 and 1.0 that specifies the ratio of workers that will be allowed to finished collecting their rollout before the rest are forced to end early. num_threads (int, optional): number of threads for this process. Defaults to the number of workers. @@ -2108,11 +2111,13 @@ class MultiSyncDataCollector(_MultiDataCollector): trajectory and the start of the next collection. This class can be safely used with online RL sota-implementations. - .. note:: Python requires multiprocessed code to be instantiated within a main guard: + .. note:: + Python requires multiprocessed code to be instantiated within a main guard: >>> from torchrl.collectors import MultiSyncDataCollector >>> if __name__ == "__main__": ... # Create your collector here + ... collector = MultiSyncDataCollector(...) See https://docs.python.org/3/library/multiprocessing.html for more info. @@ -2140,8 +2145,8 @@ class MultiSyncDataCollector(_MultiDataCollector): ... if i == 2: ... print(data) ... break - ... collector.shutdown() - ... del collector + >>> collector.shutdown() + >>> del collector TensorDict( fields={ action: Tensor(shape=torch.Size([200, 1]), device=cpu, dtype=torch.float32, is_shared=False), @@ -2768,7 +2773,7 @@ class aSyncDataCollector(MultiaSyncDataCollector): update_at_each_batch (boolm optional): if ``True``, :meth:`~.update_policy_weight_()` will be called before (sync) or after (async) each data collection. Defaults to ``False``.
- preemptive_threshold (float, optional): a value between 0.0 and 1.0 that specifies the ratio of workers + preemptive_threshold (:obj:`float`, optional): a value between 0.0 and 1.0 that specifies the ratio of workers that will be allowed to finished collecting their rollout before the rest are forced to end early. num_threads (int, optional): number of threads for this process. Defaults to the number of workers. diff --git a/torchrl/data/map/tree.py b/torchrl/data/map/tree.py index efbf6f79553..645f7704ddd 100644 --- a/torchrl/data/map/tree.py +++ b/torchrl/data/map/tree.py @@ -48,7 +48,7 @@ class Tree(TensorClass["nocast"]): If there are multiple actions taken at this node, subtrees are stored in the corresponding entry. Rollouts can be reconstructed using the :meth:`~.rollout_from_path` method. node (TensorDict): Data defining this node (e.g., observations) before the next branching. - Entries usually matches the ``in_keys`` in ``MCTSForeset.node_map``. + Entries usually matches the ``in_keys`` in ``MCTSForest.node_map``. subtree (Tree): A stack of subtrees produced when actions are taken. num_children (int): The number of child nodes (read-only). is_terminal (bool): whether the tree has children nodes (read-only). diff --git a/torchrl/data/postprocs/postprocs.py b/torchrl/data/postprocs/postprocs.py index a46b1c04ecc..a471a7b4d5f 100644 --- a/torchrl/data/postprocs/postprocs.py +++ b/torchrl/data/postprocs/postprocs.py @@ -90,7 +90,7 @@ class MultiStep(nn.Module): It is an identity transform whenever :attr:`n_steps` is 0. Args: - gamma (float): Discount factor for return computation + gamma (:obj:`float`): Discount factor for return computation n_steps (integer): maximum look-ahead steps. .. note:: This class is meant to be used within a ``DataCollector``. 
diff --git a/torchrl/data/replay_buffers/replay_buffers.py b/torchrl/data/replay_buffers/replay_buffers.py index 5e7b80d7bed..67113095af0 100644 --- a/torchrl/data/replay_buffers/replay_buffers.py +++ b/torchrl/data/replay_buffers/replay_buffers.py @@ -897,16 +897,14 @@ class PrioritizedReplayBuffer(ReplayBuffer): All arguments are keyword-only arguments. - Presented in - "Schaul, T.; Quan, J.; Antonoglou, I.; and Silver, D. 2015. - Prioritized experience replay." - (https://arxiv.org/abs/1511.05952) + Presented in "Schaul, T.; Quan, J.; Antonoglou, I.; and Silver, D. 2015. + Prioritized experience replay." (https://arxiv.org/abs/1511.05952) Args: - alpha (float): exponent α determines how much prioritization is used, + alpha (:obj:`float`): exponent α determines how much prioritization is used, with α = 0 corresponding to the uniform case. - beta (float): importance sampling negative exponent. - eps (float): delta added to the priorities to ensure that the buffer + beta (:obj:`float`): importance sampling negative exponent. + eps (:obj:`float`): delta added to the priorities to ensure that the buffer does not contain null priorities. storage (Storage, optional): the storage to be used. If none is provided a default :class:`~torchrl.data.replay_buffers.ListStorage` with @@ -1366,10 +1364,10 @@ class TensorDictPrioritizedReplayBuffer(TensorDictReplayBuffer): tensordict to be passed to it with its new priority value. Keyword Args: - alpha (float): exponent α determines how much prioritization is used, + alpha (:obj:`float`): exponent α determines how much prioritization is used, with α = 0 corresponding to the uniform case. - beta (float): importance sampling negative exponent. - eps (float): delta added to the priorities to ensure that the buffer + beta (:obj:`float`): importance sampling negative exponent. + eps (:obj:`float`): delta added to the priorities to ensure that the buffer does not contain null priorities. 
storage (Storage, optional): the storage to be used. If none is provided a default :class:`~torchrl.data.replay_buffers.ListStorage` with diff --git a/torchrl/data/replay_buffers/samplers.py b/torchrl/data/replay_buffers/samplers.py index a0f84c0c299..51b84029766 100644 --- a/torchrl/data/replay_buffers/samplers.py +++ b/torchrl/data/replay_buffers/samplers.py @@ -298,10 +298,10 @@ class PrioritizedSampler(Sampler): Args: max_capacity (int): maximum capacity of the buffer. - alpha (float): exponent α determines how much prioritization is used, + alpha (:obj:`float`): exponent α determines how much prioritization is used, with α = 0 corresponding to the uniform case. - beta (float): importance sampling negative exponent. - eps (float, optional): delta added to the priorities to ensure that the buffer + beta (:obj:`float`): importance sampling negative exponent. + eps (:obj:`float`, optional): delta added to the priorities to ensure that the buffer does not contain null priorities. Defaults to 1e-8. reduction (str, optional): the reduction method for multidimensional tensordicts (ie stored trajectory). Can be one of "max", "min", @@ -1652,10 +1652,10 @@ class PrioritizedSliceSampler(SliceSampler, PrioritizedSampler): :meth:`~.update_priority`. Args: - alpha (float): exponent α determines how much prioritization is used, + alpha (:obj:`float`): exponent α determines how much prioritization is used, with α = 0 corresponding to the uniform case. - beta (float): importance sampling negative exponent. - eps (float, optional): delta added to the priorities to ensure that the buffer + beta (:obj:`float`): importance sampling negative exponent. + eps (:obj:`float`, optional): delta added to the priorities to ensure that the buffer does not contain null priorities. Defaults to 1e-8. reduction (str, optional): the reduction method for multidimensional tensordicts (i.e., stored trajectory). 
Can be one of "max", "min", diff --git a/torchrl/data/rlhf/utils.py b/torchrl/data/rlhf/utils.py index a4ccbfd8a1b..775eb652c4b 100644 --- a/torchrl/data/rlhf/utils.py +++ b/torchrl/data/rlhf/utils.py @@ -41,7 +41,7 @@ class ConstantKLController(KLControllerBase): with. Keyword Arguments: - kl_coef (float): The coefficient to multiply KL with when calculating the + kl_coef (:obj:`float`): The coefficient to multiply KL with when calculating the reward. model (nn.Module, optional): wrapped model that needs to be controlled. Must have an attribute ``"kl_coef"``. If provided, the ``"kl_coef"`` will @@ -73,8 +73,8 @@ class AdaptiveKLController(KLControllerBase): """Adaptive KL Controller as described in Ziegler et al. "Fine-Tuning Language Models from Human Preferences". Keyword Arguments: - init_kl_coef (float): The starting value of the coefficient. - target (float): The target KL value. When the observed KL is smaller, the + init_kl_coef (:obj:`float`): The starting value of the coefficient. + target (:obj:`float`): The target KL value. When the observed KL is smaller, the coefficient is decreased, thereby relaxing the KL penalty in the training objective and allowing the model to stray further from the reference model. When the observed KL is greater than the target, the KL coefficient is @@ -146,10 +146,10 @@ class RolloutFromModel: reward_model: (nn.Module, tensordict.nn.TensorDictModule): a model which, given ``input_ids`` and ``attention_mask``, calculates rewards for each token and end_scores (the reward for the final token in each sequence). - kl_coef: (float, optional): initial kl coefficient. + kl_coef: (:obj:`float`, optional): initial kl coefficient. max_new_tokens (int, optional): the maximum length of the sequence. Defaults to 50. - score_clip (float, optional): Scores from the reward model are clipped to the + score_clip (:obj:`float`, optional): Scores from the reward model are clipped to the range ``(-score_clip, score_clip)``. Defaults to 10. 
kl_scheduler (KLControllerBase, optional): the KL coefficient scheduler. num_steps (int, optional): number of steps between two optimization. diff --git a/torchrl/envs/common.py b/torchrl/envs/common.py index 1c473d31297..b6755212f98 100644 --- a/torchrl/envs/common.py +++ b/torchrl/envs/common.py @@ -840,15 +840,15 @@ def full_action_spec(self) -> Composite: ... break >>> env = BraxEnv(envname) >>> env.full_action_spec - Composite( - action: BoundedContinuous( - shape=torch.Size([8]), - space=ContinuousBox( - low=Tensor(shape=torch.Size([8]), device=cpu, dtype=torch.float32, contiguous=True), - high=Tensor(shape=torch.Size([8]), device=cpu, dtype=torch.float32, contiguous=True)), - device=cpu, - dtype=torch.float32, - domain=continuous), device=cpu, shape=torch.Size([])) + Composite( + action: BoundedContinuous( + shape=torch.Size([8]), + space=ContinuousBox( + low=Tensor(shape=torch.Size([8]), device=cpu, dtype=torch.float32, contiguous=True), + high=Tensor(shape=torch.Size([8]), device=cpu, dtype=torch.float32, contiguous=True)), + device=cpu, + dtype=torch.float32, + domain=continuous), device=cpu, shape=torch.Size([])) """ full_action_spec = self.input_spec.get("full_action_spec", None) @@ -1791,7 +1791,7 @@ def register_gym( (results are tensors). This arg can be passed during a call to :func:`~gym.make` (see example below). - reward_threshold (float, optional): [Gym kwarg] The reward threshold + reward_threshold (:obj:`float`, optional): [Gym kwarg] The reward threshold considered to have learnt an environment. nondeterministic (bool, optional): [Gym kwarg If the environment is nondeterministic (even with knowledge of the initial seed and all actions). Defaults to diff --git a/torchrl/envs/custom/tictactoeenv.py b/torchrl/envs/custom/tictactoeenv.py index 2c93a5748ef..0a464a3e390 100644 --- a/torchrl/envs/custom/tictactoeenv.py +++ b/torchrl/envs/custom/tictactoeenv.py @@ -34,6 +34,7 @@ class TicTacToeEnv(EnvBase): output entry). 
Specs: + >>> print(env.specs) Composite( output_spec: Composite( full_observation_spec: Composite( diff --git a/torchrl/envs/transforms/rb_transforms.py b/torchrl/envs/transforms/rb_transforms.py index 2a3cd8ec319..76a8e6039f8 100644 --- a/torchrl/envs/transforms/rb_transforms.py +++ b/torchrl/envs/transforms/rb_transforms.py @@ -36,7 +36,7 @@ class MultiStepTransform(Transform): n_steps (int): Number of steps in multi-step. The number of steps can be dynamically changed by changing the ``n_steps`` attribute of this transform. - gamma (float): Discount factor. + gamma (:obj:`float`): Discount factor. Keyword Args: reward_keys (list of NestedKey, optional): the reward keys in the input tensordict. diff --git a/torchrl/envs/transforms/rlhf.py b/torchrl/envs/transforms/rlhf.py index 6228b0f22b7..ab469ecec13 100644 --- a/torchrl/envs/transforms/rlhf.py +++ b/torchrl/envs/transforms/rlhf.py @@ -25,7 +25,7 @@ class KLRewardTransform(Transform): have the following features: it must have a set of input (``in_keys``) and output keys (``out_keys``). It must have a ``get_dist`` method that outputs the distribution of the action. - coef (float): the coefficient of the KL term. Defaults to ``1.0``. + coef (:obj:`float`): the coefficient of the KL term. Defaults to ``1.0``. in_keys (str or list of str/tuples of str): the input key where the reward should be fetched. Defaults to ``"reward"``. out_keys (str or list of str/tuples of str): the output key where the diff --git a/torchrl/envs/transforms/transforms.py b/torchrl/envs/transforms/transforms.py index e05e1256c79..e02c88c5330 100644 --- a/torchrl/envs/transforms/transforms.py +++ b/torchrl/envs/transforms/transforms.py @@ -1534,7 +1534,7 @@ class TargetReturn(Transform): reward achieved at each step or remains constant. Args: - target_return (float): target return to be achieved by the agent. + target_return (:obj:`float`): target return to be achieved by the agent. mode (str): mode to be used to update the target return. 
Can be either "reduce" or "constant". Default: "reduce". in_keys (sequence of NestedKey, optional): keys pointing to the reward entries. Defaults to the reward keys of the parent env. @@ -2552,7 +2552,7 @@ class ObservationNorm(ObservationTransform): as it is done for standardization. Default is `False`. - eps (float, optional): epsilon increment for the scale in the ``standard_normal`` case. + eps (:obj:`float`, optional): epsilon increment for the scale in the ``standard_normal`` case. Defaults to ``1e-6`` if not recoverable directly from the scale dtype. Examples: @@ -2845,7 +2845,7 @@ class CatFrames(ObservationTransform): has to be written. Defaults to the value of `in_keys`. padding (str, optional): the padding method. One of ``"same"`` or ``"constant"``. Defaults to ``"same"``, ie. the first value is used for padding. - padding_value (float, optional): the value to use for padding if ``padding="constant"``. + padding_value (:obj:`float`, optional): the value to use for padding if ``padding="constant"``. Defaults to 0. as_inverse (bool, optional): if ``True``, the transform is applied as an inverse transform. Defaults to ``False``. reset_key (NestedKey, optional): the reset key to be used as partial @@ -6194,6 +6194,7 @@ class SelectTransform(Transform): keep_dones (bool, optional): if ``False``, the done keys must be provided if they should be kept. Defaults to ``True``. 
+ Examples: >>> import gymnasium >>> from torchrl.envs import GymWrapper >>> env = TransformedEnv( diff --git a/torchrl/modules/__init__.py b/torchrl/modules/__init__.py index 8523a783676..4cb6366f817 100644 --- a/torchrl/modules/__init__.py +++ b/torchrl/modules/__init__.py @@ -23,6 +23,7 @@ ) from .models import ( BatchRenorm1d, + ConsistentDropout, ConsistentDropoutModule, Conv3dNet, ConvNet, diff --git a/torchrl/modules/distributions/discrete.py b/torchrl/modules/distributions/discrete.py index 168ab977836..bfb7bc48f3c 100644 --- a/torchrl/modules/distributions/discrete.py +++ b/torchrl/modules/distributions/discrete.py @@ -180,7 +180,7 @@ class MaskedCategorical(D.Categorical): in the distribution. Exclusive with ``indices``. indices (torch.Tensor): A dense index tensor representing which actions must be taken into account. Exclusive with ``mask``. - neg_inf (float, optional): The log-probability value allocated to + neg_inf (:obj:`float`, optional): The log-probability value allocated to invalid (out-of-mask) indices. Defaults to -inf. padding_value: The padding value in the mask tensor. When sparse_mask == True, the padding_value will be ignored. @@ -339,7 +339,7 @@ class MaskedOneHotCategorical(MaskedCategorical): in the distribution. Exclusive with ``indices``. indices (torch.Tensor): A dense index tensor representing which actions must be taken into account. Exclusive with ``mask``. - neg_inf (float, optional): The log-probability value allocated to + neg_inf (:obj:`float`, optional): The log-probability value allocated to invalid (out-of-mask) indices. Defaults to -inf. padding_value: The padding value in then mask tensor when sparse_mask == True, the padding_value will be ignored. @@ -351,6 +351,7 @@ class MaskedOneHotCategorical(MaskedCategorical): ``ReparamGradientStrategy.RelaxedOneHot`` will use :class:`torch.distributions.RelaxedOneHot` to sample from the distribution. 
+ Examples: >>> torch.manual_seed(0) >>> logits = torch.randn(4) / 100 # almost equal probabilities >>> mask = torch.tensor([True, False, True, True]) diff --git a/torchrl/modules/models/__init__.py b/torchrl/modules/models/__init__.py index 90b9fadd747..35a060e8d69 100644 --- a/torchrl/modules/models/__init__.py +++ b/torchrl/modules/models/__init__.py @@ -10,6 +10,7 @@ from .decision_transformer import DecisionTransformer from .exploration import ( + ConsistentDropout, ConsistentDropoutModule, NoisyLazyLinear, NoisyLinear, diff --git a/torchrl/modules/models/batchrenorm.py b/torchrl/modules/models/batchrenorm.py index 41de0945f70..c5534568af7 100644 --- a/torchrl/modules/models/batchrenorm.py +++ b/torchrl/modules/models/batchrenorm.py @@ -22,13 +22,13 @@ class BatchRenorm1d(nn.Module): num_features (int): Number of features in the input tensor. Keyword Args: - momentum (float, optional): Momentum factor for computing the running mean and variance. + momentum (:obj:`float`, optional): Momentum factor for computing the running mean and variance. Defaults to ``0.01``. - eps (float, optional): Small value added to the variance to avoid division by zero. + eps (:obj:`float`, optional): Small value added to the variance to avoid division by zero. Defaults to ``1e-5``. - max_r (float, optional): Maximum value for the scaling factor r. + max_r (:obj:`float`, optional): Maximum value for the scaling factor r. Defaults to ``3.0``. - max_d (float, optional): Maximum value for the bias factor d. + max_d (:obj:`float`, optional): Maximum value for the bias factor d. Defaults to ``5.0``. warmup_steps (int, optional): Number of warm-up steps for the running mean and variance. Defaults to ``10000``. 
diff --git a/torchrl/modules/models/exploration.py b/torchrl/modules/models/exploration.py index d69a85fd685..54c451ebb82 100644 --- a/torchrl/modules/models/exploration.py +++ b/torchrl/modules/models/exploration.py @@ -262,11 +262,11 @@ class gSDEModule(nn.Module): outputs a distribution average. action_dim (int): the dimension of the action. state_dim (int): the state dimension. - sigma_init (float, optional): the initial value of the standard deviation. The + sigma_init (:obj:`float`, optional): the initial value of the standard deviation. The softplus non-linearity is used to map the log_sigma parameter to a positive value. Defaults to ``1.0``. - scale_min (float, optional): min value of the scale. Defaults to ``0.01``. - scale_max (float, optional): max value of the scale. Defaults to ``10.0``. + scale_min (:obj:`float`, optional): min value of the scale. Defaults to ``0.01``. + scale_max (:obj:`float`, optional): max value of the scale. Defaults to ``10.0``. learn_sigma (bool, optional): if ``True``, the value of the ``sigma`` variable will be included in the module parameters, making it learnable. Defaults to ``True``. @@ -432,11 +432,11 @@ class LazygSDEModule(LazyModuleMixin, gSDEModule): (or 1 if no ``sigma_init`` value is provided). Args: - sigma_init (float, optional): the initial value of the standard deviation. The + sigma_init (:obj:`float`, optional): the initial value of the standard deviation. The softplus non-linearity is used to map the log_sigma parameter to a positive value. Defaults to ``None`` (learned). - scale_min (float, optional): min value of the scale. Defaults to ``0.01``. - scale_max (float, optional): max value of the scale. Defaults to ``10.0``. + scale_min (:obj:`float`, optional): min value of the scale. Defaults to ``0.01``. + scale_max (:obj:`float`, optional): max value of the scale. Defaults to ``10.0``. 
learn_sigma (bool, optional): if ``True``, the value of the ``sigma`` variable will be included in the module parameters, making it learnable. Defaults to ``True``. @@ -557,7 +557,7 @@ class ConsistentDropout(_DropoutNd): this module. Args: - p (float, optional): Dropout probability. Defaults to ``0.5``. + p (:obj:`float`, optional): Dropout probability. Defaults to ``0.5``. .. seealso:: @@ -609,7 +609,7 @@ class ConsistentDropoutModule(TensorDictModuleBase): """A TensorDictModule wrapper for :class:`~ConsistentDropout`. Args: - p (float, optional): Dropout probability. Default: ``0.5``. + p (:obj:`float`, optional): Dropout probability. Default: ``0.5``. in_keys (NestedKey or list of NestedKeys): keys to be read from input tensordict and passed to this module. out_keys (NestedKey or iterable of NestedKeys): keys to be written to the input tensordict. diff --git a/torchrl/modules/models/model_based.py b/torchrl/modules/models/model_based.py index 60bc7d578ba..58d0362a118 100644 --- a/torchrl/modules/models/model_based.py +++ b/torchrl/modules/models/model_based.py @@ -41,9 +41,9 @@ class DreamerActor(nn.Module): Defaults to 200. activation_class (nn.Module, optional): Activation class. Defaults to nn.ELU. - std_bias (float, optional): Bias of the softplus transform. + std_bias (:obj:`float`, optional): Bias of the softplus transform. Defaults to 5.0. - std_min_val (float, optional): Minimum value of the standard deviation. + std_min_val (:obj:`float`, optional): Minimum value of the standard deviation. Defaults to 1e-4. """ @@ -282,7 +282,7 @@ class RSSMPrior(nn.Module): Defaults to 200. state_dim (int, optional): Size of the state. Defaults to 30. - scale_lb (float, optional): Lower bound of the scale of the state distribution. + scale_lb (:obj:`float`, optional): Lower bound of the scale of the state distribution. Defaults to 0.1. @@ -346,7 +346,7 @@ class RSSMPosterior(nn.Module): Defaults to 200. state_dim (int, optional): Size of the state. Defaults to 30. 
- scale_lb (float, optional): Lower bound of the scale of the state distribution. + scale_lb (:obj:`float`, optional): Lower bound of the scale of the state distribution. Defaults to 0.1. """ diff --git a/torchrl/modules/models/models.py b/torchrl/modules/models/models.py index 10884f8d415..49c0b0961ab 100644 --- a/torchrl/modules/models/models.py +++ b/torchrl/modules/models/models.py @@ -67,7 +67,7 @@ class or constructor to be used. norm_kwargs (dict or list of dicts, optional): kwargs to be used with the normalization layers. Aslo accepts a list of kwargs of length ``depth + int(activate_last_layer)``. - dropout (float, optional): dropout probability. Defaults to ``None`` (no + dropout (:obj:`float`, optional): dropout probability. Defaults to ``None`` (no dropout); bias_last_layer (bool): if ``True``, the last Linear layer will have a bias parameter. default: True; @@ -1059,7 +1059,7 @@ def ddpg_init_last_layer( Args: module (nn.Module): an actor or critic to be initialized. - scale (float, optional): the noise scale. Defaults to ``6e-4``. + scale (:obj:`float`, optional): the noise scale. Defaults to ``6e-4``. device (torch.device, optional): the device where the noise should be created. Defaults to the device of the last layer's weight parameter. diff --git a/torchrl/modules/tensordict_module/exploration.py b/torchrl/modules/tensordict_module/exploration.py index 433815f131c..df947236970 100644 --- a/torchrl/modules/tensordict_module/exploration.py +++ b/torchrl/modules/tensordict_module/exploration.py @@ -188,56 +188,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: class EGreedyWrapper(TensorDictModuleWrapper): - """[Deprecated] Epsilon-Greedy PO wrapper. - - Args: - policy (TensorDictModule): a deterministic policy. - - Keyword Args: - eps_init (scalar, optional): initial epsilon value. - default: 1.0 - eps_end (scalar, optional): final epsilon value. 
- default: 0.1 - annealing_num_steps (int, optional): number of steps it will take for epsilon to reach the eps_end value - action_key (NestedKey, optional): the key where the action can be found in the input tensordict. - Default is ``"action"``. - action_mask_key (NestedKey, optional): the key where the action mask can be found in the input tensordict. - Default is ``None`` (corresponding to no mask). - spec (TensorSpec, optional): if provided, the sampled action will be - taken from this action space. If not provided, - the exploration wrapper will attempt to recover it from the policy. - - .. note:: - Once a module has been wrapped in :class:`EGreedyWrapper`, it is - crucial to incorporate a call to :meth:`~.step` in the training loop - to update the exploration factor. - Since it is not easy to capture this omission no warning or exception - will be raised if this is ommitted! - - Examples: - >>> import torch - >>> from tensordict import TensorDict - >>> from torchrl.modules import EGreedyWrapper, Actor - >>> from torchrl.data import Bounded - >>> torch.manual_seed(0) - >>> spec = Bounded(-1, 1, torch.Size([4])) - >>> module = torch.nn.Linear(4, 4, bias=False) - >>> policy = Actor(spec=spec, module=module) - >>> explorative_policy = EGreedyWrapper(policy, eps_init=0.2) - >>> td = TensorDict({"observation": torch.zeros(10, 4)}, batch_size=[10]) - >>> print(explorative_policy(td).get("action")) - tensor([[ 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.9055, -0.9277, -0.6295, -0.2532], - [ 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000]], grad_fn=) - - """ + """[Deprecated] Epsilon-Greedy PO wrapper.""" def __init__( self, @@ -268,8 +219,8 @@ class AdditiveGaussianWrapper(TensorDictModuleWrapper): default: 0.1 annealing_num_steps 
(int, optional): number of steps it will take for sigma to reach the :obj:`sigma_end` value. - mean (float, optional): mean of each output element’s normal distribution. - std (float, optional): standard deviation of each output element’s normal distribution. + mean (:obj:`float`, optional): mean of each output element’s normal distribution. + std (:obj:`float`, optional): standard deviation of each output element’s normal distribution. action_key (NestedKey, optional): if the policy module has more than one output key, its output spec will be of type Composite. One needs to know where to find the action spec. @@ -408,9 +359,9 @@ class AdditiveGaussianModule(TensorDictModuleBase): annealing_num_steps (int, optional): number of steps it will take for sigma to reach the :obj:`sigma_end` value. default: 1000 - mean (float, optional): mean of each output element’s normal distribution. + mean (:obj:`float`, optional): mean of each output element’s normal distribution. default: 0.0 - std (float, optional): standard deviation of each output element’s normal distribution. + std (:obj:`float`, optional): standard deviation of each output element’s normal distribution. default: 1.0 Keyword Args: diff --git a/torchrl/objectives/a2c.py b/torchrl/objectives/a2c.py index d9472bdcde8..f324e491298 100644 --- a/torchrl/objectives/a2c.py +++ b/torchrl/objectives/a2c.py @@ -63,8 +63,8 @@ class A2CLoss(LossModule): ``samples_mc_entropy`` will control how many samples will be used to compute this estimate. Defaults to ``1``. - entropy_coef (float): the weight of the entropy loss. Defaults to `0.01``. - critic_coef (float): the weight of the critic loss. Defaults to ``1.0``. If ``None``, the critic + entropy_coef (:obj:`float`): the weight of the entropy loss. Defaults to ``0.01``. + critic_coef (:obj:`float`): the weight of the critic loss. Defaults to ``1.0``. If ``None``, the critic loss won't be included and the in-keys will miss the critic inputs.
loss_critic_type (str): loss function for the value discrepancy. Can be one of "l1", "l2" or "smooth_l1". Defaults to ``"smooth_l1"``. @@ -84,7 +84,7 @@ class A2CLoss(LossModule): ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, ``"mean"``: the sum of the output will be divided by the number of elements in the output, ``"sum"``: the output will be summed. Default: ``"mean"``. - clip_value (float, optional): If provided, it will be used to compute a clipped version of the value + clip_value (:obj:`float`, optional): If provided, it will be used to compute a clipped version of the value prediction with respect to the input value estimate and use it to calculate the value loss. The purpose of clipping is to limit the impact of extreme value predictions, helping stabilize training and preventing large updates. However, it will have no impact if the value estimate was done by the current diff --git a/torchrl/objectives/cql.py b/torchrl/objectives/cql.py index 55575ba2b6e..191096e7492 100644 --- a/torchrl/objectives/cql.py +++ b/torchrl/objectives/cql.py @@ -59,11 +59,11 @@ class CQLLoss(LossModule): Keyword args: loss_function (str, optional): loss function to be used with the value function loss. Default is `"smooth_l1"`. - alpha_init (float, optional): initial entropy multiplier. + alpha_init (:obj:`float`, optional): initial entropy multiplier. Default is 1.0. - min_alpha (float, optional): min value of alpha. + min_alpha (:obj:`float`, optional): min value of alpha. Default is None (no minimum value). - max_alpha (float, optional): max value of alpha. + max_alpha (:obj:`float`, optional): max value of alpha. Default is None (no maximum value). action_spec (TensorSpec, optional): the action tensor spec. 
If not provided and the target entropy is ``"auto"``, it will be retrieved from @@ -81,9 +81,9 @@ class CQLLoss(LossModule): delay_qvalue (bool, optional): Whether to separate the target Q value networks from the Q value networks used for data collection. Default is ``True``. - gamma (float, optional): Discount factor. Default is ``None``. - temperature (float, optional): CQL temperature. Default is 1.0. - min_q_weight (float, optional): Minimum Q weight. Default is 1.0. + gamma (:obj:`float`, optional): Discount factor. Default is ``None``. + temperature (:obj:`float`, optional): CQL temperature. Default is 1.0. + min_q_weight (:obj:`float`, optional): Minimum Q weight. Default is 1.0. max_q_backup (bool, optional): Whether to use the max-min Q backup. Default is ``False``. deterministic_backup (bool, optional): Whether to use the deterministic. Default is ``True``. @@ -91,7 +91,7 @@ class CQLLoss(LossModule): Default is 10. with_lagrange (bool, optional): Whether to use the Lagrange multiplier. Default is ``False``. - lagrange_thresh (float, optional): Lagrange threshold. Default is 0.0. + lagrange_thresh (:obj:`float`, optional): Lagrange threshold. Default is 0.0. reduction (str, optional): Specifies the reduction to apply to the output: ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, ``"mean"``: the sum of the output will be divided by the number of @@ -916,7 +916,7 @@ class DiscreteCQLLoss(LossModule): delay_value (bool): Whether to separate the target Q value networks from the Q value networks used for data collection. Default is ``True``. - gamma (float, optional): Discount factor. Default is ``None``. + gamma (:obj:`float`, optional): Discount factor. Default is ``None``. action_space: The action space of the environment. If None, it is inferred from the value network. Defaults to None. 
reduction (str, optional): Specifies the reduction to apply to the output: diff --git a/torchrl/objectives/crossq.py b/torchrl/objectives/crossq.py index eb1888fac11..ca6559ac5b8 100644 --- a/torchrl/objectives/crossq.py +++ b/torchrl/objectives/crossq.py @@ -67,11 +67,11 @@ class CrossQLoss(LossModule): Defaults to ``2``. loss_function (str, optional): loss function to be used with the value function loss. Default is `"smooth_l1"`. - alpha_init (float, optional): initial entropy multiplier. + alpha_init (:obj:`float`, optional): initial entropy multiplier. Default is 1.0. - min_alpha (float, optional): min value of alpha. + min_alpha (:obj:`float`, optional): min value of alpha. Default is None (no minimum value). - max_alpha (float, optional): max value of alpha. + max_alpha (:obj:`float`, optional): max value of alpha. Default is None (no maximum value). action_spec (TensorSpec, optional): the action tensor spec. If not provided and the target entropy is ``"auto"``, it will be retrieved from diff --git a/torchrl/objectives/decision_transformer.py b/torchrl/objectives/decision_transformer.py index b6783116ba1..1b1f0aa4e0b 100644 --- a/torchrl/objectives/decision_transformer.py +++ b/torchrl/objectives/decision_transformer.py @@ -29,11 +29,11 @@ class OnlineDTLoss(LossModule): actor_network (ProbabilisticActor): stochastic actor Keyword Args: - alpha_init (float, optional): initial entropy multiplier. + alpha_init (:obj:`float`, optional): initial entropy multiplier. Default is 1.0. - min_alpha (float, optional): min value of alpha. + min_alpha (:obj:`float`, optional): min value of alpha. Default is None (no minimum value). - max_alpha (float, optional): max value of alpha. + max_alpha (:obj:`float`, optional): max value of alpha. Default is None (no maximum value). fixed_alpha (bool, optional): if ``True``, alpha will be fixed to its initial value. 
Otherwise, alpha will be optimized to diff --git a/torchrl/objectives/deprecated.py b/torchrl/objectives/deprecated.py index 32394942600..2a4124c80de 100644 --- a/torchrl/objectives/deprecated.py +++ b/torchrl/objectives/deprecated.py @@ -58,11 +58,11 @@ class REDQLoss_deprecated(LossModule): loss_function (str, optional): loss function to be used for the Q-value. Can be one of ``"smooth_l1"``, ``"l2"``, ``"l1"``, Default is ``"smooth_l1"``. - alpha_init (float, optional): initial entropy multiplier. + alpha_init (:obj:`float`, optional): initial entropy multiplier. Default is ``1.0``. - min_alpha (float, optional): min value of alpha. + min_alpha (:obj:`float`, optional): min value of alpha. Default is ``0.1``. - max_alpha (float, optional): max value of alpha. + max_alpha (:obj:`float`, optional): max value of alpha. Default is ``10.0``. action_spec (TensorSpec, optional): the action tensor spec. If not provided and the target entropy is ``"auto"``, it will be retrieved from diff --git a/torchrl/objectives/dreamer.py b/torchrl/objectives/dreamer.py index 73df58b7e56..418ec95b677 100644 --- a/torchrl/objectives/dreamer.py +++ b/torchrl/objectives/dreamer.py @@ -39,9 +39,9 @@ class DreamerModelLoss(LossModule): Args: world_model (TensorDictModule): the world model. - lambda_kl (float, optional): the weight of the kl divergence loss. Default: 1.0. - lambda_reco (float, optional): the weight of the reconstruction loss. Default: 1.0. - lambda_reward (float, optional): the weight of the reward loss. Default: 1.0. + lambda_kl (:obj:`float`, optional): the weight of the kl divergence loss. Default: 1.0. + lambda_reco (:obj:`float`, optional): the weight of the reconstruction loss. Default: 1.0. + lambda_reward (:obj:`float`, optional): the weight of the reward loss. Default: 1.0. reco_loss (str, optional): the reconstruction loss. Default: "l2". reward_loss (str, optional): the reward loss. Default: "l2". free_nats (int, optional): the free nats. Default: 3. 
@@ -374,7 +374,7 @@ class DreamerValueLoss(LossModule): Default: ``"l2"``. discount_loss (bool, optional): if ``True``, the loss is discounted with a gamma discount factor. Default: False. - gamma (float, optional): the gamma discount factor. Default: ``0.99``. + gamma (:obj:`float`, optional): the gamma discount factor. Default: ``0.99``. """ diff --git a/torchrl/objectives/gail.py b/torchrl/objectives/gail.py index 3c0050fca84..bbac2581199 100644 --- a/torchrl/objectives/gail.py +++ b/torchrl/objectives/gail.py @@ -27,7 +27,7 @@ class GAILLoss(LossModule): Keyword Args: use_grad_penalty (bool, optional): Whether to use gradient penalty. Default: ``False``. - gp_lambda (float, optional): Gradient penalty lambda. Default: ``10``. + gp_lambda (:obj:`float`, optional): Gradient penalty lambda. Default: ``10``. reduction (str, optional): Specifies the reduction to apply to the output: ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, ``"mean"``: the sum of the output will be divided by the number of diff --git a/torchrl/objectives/iql.py b/torchrl/objectives/iql.py index 8e30019955b..88af015686b 100644 --- a/torchrl/objectives/iql.py +++ b/torchrl/objectives/iql.py @@ -52,11 +52,11 @@ class IQLLoss(LossModule): Defaults to ``2``. loss_function (str, optional): loss function to be used with the value function loss. Default is `"smooth_l1"`. - temperature (float, optional): Inverse temperature (beta). + temperature (:obj:`float`, optional): Inverse temperature (beta). For smaller hyperparameter values, the objective behaves similarly to behavioral cloning, while for larger values, it attempts to recover the maximum of the Q-function. - expectile (float, optional): expectile :math:`\tau`. A larger value of :math:`\tau` is crucial + expectile (:obj:`float`, optional): expectile :math:`\tau`. A larger value of :math:`\tau` is crucial for antmaze tasks that require dynamical programming ("stichting"). 
priority_key (str, optional): [Deprecated, use .set_keys(priority_key=priority_key) instead] tensordict key where to write the priority (for prioritized replay @@ -539,11 +539,11 @@ class DiscreteIQLLoss(IQLLoss): Defaults to ``2``. loss_function (str, optional): loss function to be used with the value function loss. Default is `"smooth_l1"`. - temperature (float, optional): Inverse temperature (beta). + temperature (:obj:`float`, optional): Inverse temperature (beta). For smaller hyperparameter values, the objective behaves similarly to behavioral cloning, while for larger values, it attempts to recover the maximum of the Q-function. - expectile (float, optional): expectile :math:`\tau`. A larger value of :math:`\tau` is crucial + expectile (:obj:`float`, optional): expectile :math:`\tau`. A larger value of :math:`\tau` is crucial for antmaze tasks that require dynamical programming ("stichting"). priority_key (str, optional): [Deprecated, use .set_keys(priority_key=priority_key) instead] tensordict key where to write the priority (for prioritized replay diff --git a/torchrl/objectives/ppo.py b/torchrl/objectives/ppo.py index ef78bc4bb0f..8c64c1ba539 100644 --- a/torchrl/objectives/ppo.py +++ b/torchrl/objectives/ppo.py @@ -106,7 +106,7 @@ class PPOLoss(LossModule): ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, ``"mean"``: the sum of the output will be divided by the number of elements in the output, ``"sum"``: the output will be summed. Default: ``"mean"``. - clip_value (float, optional): If provided, it will be used to compute a clipped version of the value + clip_value (:obj:`float`, optional): If provided, it will be used to compute a clipped version of the value prediction with respect to the input tensordict value estimate and use it to calculate the value loss. The purpose of clipping is to limit the impact of extreme value predictions, helping stabilize training and preventing large updates. 
However, it will have no impact if the value estimate was done by the current @@ -960,7 +960,7 @@ class KLPENPPOLoss(PPOLoss): ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, ``"mean"``: the sum of the output will be divided by the number of elements in the output, ``"sum"``: the output will be summed. Default: ``"mean"``. - clip_value (float, optional): If provided, it will be used to compute a clipped version of the value + clip_value (:obj:`float`, optional): If provided, it will be used to compute a clipped version of the value prediction with respect to the input tensordict value estimate and use it to calculate the value loss. The purpose of clipping is to limit the impact of extreme value predictions, helping stabilize training and preventing large updates. However, it will have no impact if the value estimate was done by the current diff --git a/torchrl/objectives/redq.py b/torchrl/objectives/redq.py index 271f233bae8..e234df1a512 100644 --- a/torchrl/objectives/redq.py +++ b/torchrl/objectives/redq.py @@ -59,11 +59,11 @@ class REDQLoss(LossModule): loss_function (str, optional): loss function to be used for the Q-value. Can be one of ``"smooth_l1"``, ``"l2"``, ``"l1"``, Default is ``"smooth_l1"``. - alpha_init (float, optional): initial entropy multiplier. + alpha_init (:obj:`float`, optional): initial entropy multiplier. Default is ``1.0``. - min_alpha (float, optional): min value of alpha. + min_alpha (:obj:`float`, optional): min value of alpha. Default is ``0.1``. - max_alpha (float, optional): max value of alpha. + max_alpha (:obj:`float`, optional): max value of alpha. Default is ``10.0``. action_spec (TensorSpec, optional): the action tensor spec. 
If not provided and the target entropy is ``"auto"``, it will be retrieved from diff --git a/torchrl/objectives/reinforce.py b/torchrl/objectives/reinforce.py index 08ff896610c..2f207339b2f 100644 --- a/torchrl/objectives/reinforce.py +++ b/torchrl/objectives/reinforce.py @@ -66,7 +66,7 @@ class ReinforceLoss(LossModule): ``"none"`` | ``"mean"`` | ``"sum"``. ``"none"``: no reduction will be applied, ``"mean"``: the sum of the output will be divided by the number of elements in the output, ``"sum"``: the output will be summed. Default: ``"mean"``. - clip_value (float, optional): If provided, it will be used to compute a clipped version of the value + clip_value (:obj:`float`, optional): If provided, it will be used to compute a clipped version of the value prediction with respect to the input tensordict value estimate and use it to calculate the value loss. The purpose of clipping is to limit the impact of extreme value predictions, helping stabilize training and preventing large updates. However, it will have no impact if the value estimate was done by the current diff --git a/torchrl/objectives/sac.py b/torchrl/objectives/sac.py index f37b0fba6f0..5cd89f1fa10 100644 --- a/torchrl/objectives/sac.py +++ b/torchrl/objectives/sac.py @@ -90,11 +90,11 @@ class SACLoss(LossModule): Defaults to ``2``. loss_function (str, optional): loss function to be used with the value function loss. Default is `"smooth_l1"`. - alpha_init (float, optional): initial entropy multiplier. + alpha_init (:obj:`float`, optional): initial entropy multiplier. Default is 1.0. - min_alpha (float, optional): min value of alpha. + min_alpha (:obj:`float`, optional): min value of alpha. Default is None (no minimum value). - max_alpha (float, optional): max value of alpha. + max_alpha (:obj:`float`, optional): max value of alpha. Default is None (no maximum value). action_spec (TensorSpec, optional): the action tensor spec. 
If not provided and the target entropy is ``"auto"``, it will be retrieved from @@ -831,14 +831,14 @@ class DiscreteSACLoss(LossModule): num_qvalue_nets (int, optional): Number of Q-value networks to be trained. Default is 2. loss_function (str, optional): loss function to be used for the Q-value. Can be one of `"smooth_l1"`, "l2", "l1", Default is "smooth_l1". - alpha_init (float, optional): initial entropy multiplier. + alpha_init (:obj:`float`, optional): initial entropy multiplier. Default is 1.0. - min_alpha (float, optional): min value of alpha. + min_alpha (:obj:`float`, optional): min value of alpha. Default is None (no minimum value). - max_alpha (float, optional): max value of alpha. + max_alpha (:obj:`float`, optional): max value of alpha. Default is None (no maximum value). fixed_alpha (bool, optional): whether alpha should be trained to match a target entropy. Default is ``False``. - target_entropy_weight (float, optional): weight for the target entropy term. + target_entropy_weight (:obj:`float`, optional): weight for the target entropy term. target_entropy (Union[str, Number], optional): Target entropy for the stochastic policy. Default is "auto". delay_qvalue (bool, optional): Whether to separate the target Q value networks from the Q value networks used for data collection. Default is ``False``. @@ -855,51 +855,51 @@ class DiscreteSACLoss(LossModule): elements in the output, ``"sum"``: the output will be summed. Default: ``"mean"``. 
Examples: - >>> import torch - >>> from torch import nn - >>> from torchrl.data.tensor_specs import OneHot - >>> from torchrl.modules.distributions import NormalParamExtractor, OneHotCategorical - >>> from torchrl.modules.tensordict_module.actors import ProbabilisticActor, ValueOperator - >>> from torchrl.modules.tensordict_module.common import SafeModule - >>> from torchrl.objectives.sac import DiscreteSACLoss - >>> from tensordict import TensorDict - >>> from tensordict.nn import TensorDictModule - >>> n_act, n_obs = 4, 3 - >>> spec = OneHot(n_act) - >>> module = TensorDictModule(nn.Linear(n_obs, n_act), in_keys=["observation"], out_keys=["logits"]) - >>> actor = ProbabilisticActor( - ... module=module, - ... in_keys=["logits"], - ... out_keys=["action"], - ... spec=spec, - ... distribution_class=OneHotCategorical) - >>> qvalue = TensorDictModule( - ... nn.Linear(n_obs, n_act), - ... in_keys=["observation"], - ... out_keys=["action_value"], - ... ) - >>> loss = DiscreteSACLoss(actor, qvalue, action_space=spec, num_actions=spec.space.n) - >>> batch = [2,] - >>> action = spec.rand(batch) - >>> data = TensorDict({ - ... "observation": torch.randn(*batch, n_obs), - ... "action": action, - ... ("next", "done"): torch.zeros(*batch, 1, dtype=torch.bool), - ... ("next", "terminated"): torch.zeros(*batch, 1, dtype=torch.bool), - ... ("next", "reward"): torch.randn(*batch, 1), - ... ("next", "observation"): torch.randn(*batch, n_obs), - ... 
}, batch) - >>> loss(data) - TensorDict( - fields={ - alpha: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), - entropy: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), - loss_actor: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), - loss_alpha: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), - loss_qvalue: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)}, - batch_size=torch.Size([]), - device=None, - is_shared=False) + >>> import torch + >>> from torch import nn + >>> from torchrl.data.tensor_specs import OneHot + >>> from torchrl.modules.distributions import NormalParamExtractor, OneHotCategorical + >>> from torchrl.modules.tensordict_module.actors import ProbabilisticActor, ValueOperator + >>> from torchrl.modules.tensordict_module.common import SafeModule + >>> from torchrl.objectives.sac import DiscreteSACLoss + >>> from tensordict import TensorDict + >>> from tensordict.nn import TensorDictModule + >>> n_act, n_obs = 4, 3 + >>> spec = OneHot(n_act) + >>> module = TensorDictModule(nn.Linear(n_obs, n_act), in_keys=["observation"], out_keys=["logits"]) + >>> actor = ProbabilisticActor( + ... module=module, + ... in_keys=["logits"], + ... out_keys=["action"], + ... spec=spec, + ... distribution_class=OneHotCategorical) + >>> qvalue = TensorDictModule( + ... nn.Linear(n_obs, n_act), + ... in_keys=["observation"], + ... out_keys=["action_value"], + ... ) + >>> loss = DiscreteSACLoss(actor, qvalue, action_space=spec, num_actions=spec.space.n) + >>> batch = [2,] + >>> action = spec.rand(batch) + >>> data = TensorDict({ + ... "observation": torch.randn(*batch, n_obs), + ... "action": action, + ... ("next", "done"): torch.zeros(*batch, 1, dtype=torch.bool), + ... ("next", "terminated"): torch.zeros(*batch, 1, dtype=torch.bool), + ... ("next", "reward"): torch.randn(*batch, 1), + ... 
("next", "observation"): torch.randn(*batch, n_obs), + ... }, batch) + >>> loss(data) + TensorDict( + fields={ + alpha: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), + entropy: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), + loss_actor: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), + loss_alpha: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), + loss_qvalue: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)}, + batch_size=torch.Size([]), + device=None, + is_shared=False) This class is compatible with non-tensordict based modules too and can be diff --git a/torchrl/objectives/td3.py b/torchrl/objectives/td3.py index 89ff581991f..42e04ec2212 100644 --- a/torchrl/objectives/td3.py +++ b/torchrl/objectives/td3.py @@ -52,9 +52,9 @@ class TD3Loss(LossModule): Exclusive with bounds. Either this or ``bounds`` must be provided. num_qvalue_nets (int, optional): Number of Q-value networks to be trained. Default is ``10``. - policy_noise (float, optional): Standard deviation for the target + policy_noise (:obj:`float`, optional): Standard deviation for the target policy action noise. Default is ``0.2``. - noise_clip (float, optional): Clipping range value for the sampled + noise_clip (:obj:`float`, optional): Clipping range value for the sampled target policy action noise. Default is ``0.5``. priority_key (str, optional): Key where to write the priority value for prioritized replay buffers. Default is diff --git a/torchrl/objectives/td3_bc.py b/torchrl/objectives/td3_bc.py index 8b394137480..998fbde6ea4 100644 --- a/torchrl/objectives/td3_bc.py +++ b/torchrl/objectives/td3_bc.py @@ -61,11 +61,11 @@ class TD3BCLoss(LossModule): Exclusive with ``bounds``. Either this or ``bounds`` must be provided. num_qvalue_nets (int, optional): Number of Q-value networks to be trained. Default is ``2``. 
- policy_noise (float, optional): Standard deviation for the target + policy_noise (:obj:`float`, optional): Standard deviation for the target policy action noise. Default is ``0.2``. - noise_clip (float, optional): Clipping range value for the sampled + noise_clip (:obj:`float`, optional): Clipping range value for the sampled target policy action noise. Default is ``0.5``. - alpha (float, optional): Weight for the behavioral cloning loss. + alpha (:obj:`float`, optional): Weight for the behavioral cloning loss. Defaults to ``2.5``. priority_key (str, optional): Key where to write the priority value for prioritized replay buffers. Default is diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py index 972fd200e0e..b0ba254d2b3 100644 --- a/torchrl/objectives/utils.py +++ b/torchrl/objectives/utils.py @@ -463,7 +463,7 @@ def next_state_value( key-value in the input tensordict when called. It does not need to be provided if pred_next_val is given. next_val_key (str, optional): key where the next value will be written. Default: 'state_action_value' - gamma (float, optional): return discount rate. + gamma (:obj:`float`, optional): return discount rate. default: 0.99 pred_next_val (Tensor, optional): the next state value can be provided if it is not computed with the operator. diff --git a/torchrl/objectives/value/functional.py b/torchrl/objectives/value/functional.py index bb737d7c20d..15e5d56d6bf 100644 --- a/torchrl/objectives/value/functional.py +++ b/torchrl/objectives/value/functional.py @@ -233,7 +233,7 @@ def _fast_vec_gae( terminated (torch.Tensor): a [B, T] boolean tensor containing the terminated states. gamma (scalar): the gamma decay (trajectory discount) lmbda (scalar): the lambda decay (exponential mean discount) - thr (float): threshold for the filter. Below this limit, components will ignored. + thr (:obj:`float`): threshold for the filter. Below this limit, components will be ignored. Defaults to 1e-7.
All tensors (values, reward and done) must have shape @@ -482,26 +482,30 @@ def td1_return_estimate( terminated (Tensor): boolean flag for the end of episode. Defaults to ``done`` if not provided. rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma - if a gamma tensor is tied to a single event: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, - v2 + g2 v3 + g2 g3 v4, - v3 + g3 v4, - v4, - ] - if False, it is assumed that each gamma is tied to the upcoming + of a gamma tensor is tied to a single event: + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, + ... v2 + g2 v3 + g2 g3 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + if ``False``, it is assumed that each gamma is tied to the upcoming trajectory: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1**2 v3 + g**3 v4, - v2 + g2 v3 + g2**2 v4, - v3 + g3 v4, - v4, - ] - Default is True. + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1**2 v3 + g1**3 v4, + ... v2 + g2 v3 + g2**2 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + Default is ``True``. time_dim (int): dimension where the time is unrolled. Defaults to -2. All tensors (values, reward and done) must have shape @@ -579,26 +583,30 @@ def td1_advantage_estimate( terminated (Tensor): boolean flag for the end of episode. Defaults to ``done`` if not provided. rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma - if a gamma tensor is tied to a single event: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, - v2 + g2 v3 + g2 g3 v4, - v3 + g3 v4, - v4, - ] - if False, it is assumed that each gamma is tied to the upcoming + of a gamma tensor is tied to a single event: + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ...
v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, + ... v2 + g2 v3 + g2 g3 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + if ``False``, it is assumed that each gamma is tied to the upcoming trajectory: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1**2 v3 + g**3 v4, - v2 + g2 v3 + g2**2 v4, - v3 + g3 v4, - v4, - ] - Default is True. + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1**2 v3 + g1**3 v4, + ... v2 + g2 v3 + g2**2 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + Default is ``True``. time_dim (int): dimension where the time is unrolled. Defaults to -2. All tensors (values, reward and done) must have shape @@ -650,27 +658,31 @@ def vec_td1_return_estimate( terminated (Tensor): boolean flag for the end of episode. Defaults to ``done`` if not provided. rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma - if a gamma tensor is tied to a single event: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, - v2 + g2 v3 + g2 g3 v4, - v3 + g3 v4, - v4, - ] - if False, it is assumed that each gamma is tied to the upcoming + of the gamma tensor is tied to a single event: + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, + ... v2 + g2 v3 + g2 g3 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + if ``False``, it is assumed that each gamma is tied to the upcoming trajectory: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1**2 v3 + g**3 v4, - v2 + g2 v3 + g2**2 v4, - v3 + g3 v4, - v4, - ] - Default is True. - time_dim (int): dimension where the time is unrolled. Defaults to -2. + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1**2 v3 + g1**3 v4, + ... v2 + g2 v3 + g2**2 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + Default is ``True``.
+ time_dim (int): dimension where the time is unrolled. Defaults to ``-2``. All tensors (values, reward and done) must have shape ``[*Batch x TimeSteps x *F]``, with ``*F`` feature dimensions. @@ -709,26 +721,30 @@ def vec_td1_advantage_estimate( terminated (Tensor): boolean flag for the end of episode. Defaults to ``done`` if not provided. rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma - if a gamma tensor is tied to a single event: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, - v2 + g2 v3 + g2 g3 v4, - v3 + g3 v4, - v4, - ] - if False, it is assumed that each gamma is tied to the upcoming + of a gamma tensor is tied to a single event: + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, + ... v2 + g2 v3 + g2 g3 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + if ``False``, it is assumed that each gamma is tied to the upcoming trajectory: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1**2 v3 + g**3 v4, - v2 + g2 v3 + g2**2 v4, - v3 + g3 v4, - v4, - ] - Default is True. + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1**2 v3 + g1**3 v4, + ... v2 + g2 v3 + g2**2 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + Default is ``True``. time_dim (int): dimension where the time is unrolled. Defaults to -2. All tensors (values, reward and done) must have shape @@ -787,26 +803,30 @@ def td_lambda_return_estimate( terminated (Tensor): boolean flag for the end of episode. Defaults to ``done`` if not provided.
rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma - if a gamma tensor is tied to a single event: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, - v2 + g2 v3 + g2 g3 v4, - v3 + g3 v4, - v4, - ] - if False, it is assumed that each gamma is tied to the upcoming + of a gamma tensor is tied to a single event: + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, + ... v2 + g2 v3 + g2 g3 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + if ``False``, it is assumed that each gamma is tied to the upcoming trajectory: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1**2 v3 + g**3 v4, - v2 + g2 v3 + g2**2 v4, - v3 + g3 v4, - v4, - ] - Default is True. + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1**2 v3 + g**3 v4, + ... v2 + g2 v3 + g2**2 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + Default is ``True``. time_dim (int): dimension where the time is unrolled. Defaults to -2. All tensors (values, reward and done) must have shape @@ -893,26 +913,30 @@ def td_lambda_advantage_estimate( terminated (Tensor): boolean flag for the end of episode. Defaults to ``done`` if not provided. rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma - if a gamma tensor is tied to a single event: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, - v2 + g2 v3 + g2 g3 v4, - v3 + g3 v4, - v4, - ] - if False, it is assumed that each gamma is tied to the upcoming + of a gamma tensor is tied to a single event: + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, + ... v2 + g2 v3 + g2 g3 v4, + ... v3 + g3 v4, + ... v4, + ... 
] + + if ``False``, it is assumed that each gamma is tied to the upcoming trajectory: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1**2 v3 + g**3 v4, - v2 + g2 v3 + g2**2 v4, - v3 + g3 v4, - v4, - ] - Default is True. + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1**2 v3 + g**3 v4, + ... v2 + g2 v3 + g2**2 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + Default is ``True``. time_dim (int): dimension where the time is unrolled. Defaults to -2. All tensors (values, reward and done) must have shape @@ -966,7 +990,7 @@ def _fast_td_lambda_return_estimate( reward (torch.Tensor): a [*B, T, F] tensor containing rewards done (Tensor): boolean flag for end of trajectory. terminated (Tensor): boolean flag for end of episode. - thr (float): threshold for the filter. Below this limit, components will ignored. + thr (:obj:`float`): threshold for the filter. Below this limit, components will be ignored. Defaults to 1e-7. All tensors (values, reward and done) must have shape @@ -1029,26 +1053,30 @@ def vec_td_lambda_return_estimate( terminated (Tensor): boolean flag for the end of episode. Defaults to ``done`` if not provided. rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma - if a gamma tensor is tied to a single event: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, - v2 + g2 v3 + g2 g3 v4, - v3 + g3 v4, - v4, - ] - if False, it is assumed that each gamma is tied to the upcoming + of a gamma tensor is tied to a single event: + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, + ... v2 + g2 v3 + g2 g3 v4, + ... v3 + g3 v4, + ... v4, + ...
] + + if ``False``, it is assumed that each gamma is tied to the upcoming trajectory: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1**2 v3 + g**3 v4, - v2 + g2 v3 + g2**2 v4, - v3 + g3 v4, - v4, - ] - Default is True. + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1**2 v3 + g**3 v4, + ... v2 + g2 v3 + g2**2 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + Default is ``True``. time_dim (int): dimension where the time is unrolled. Defaults to -2. All tensors (values, reward and done) must have shape @@ -1179,26 +1207,30 @@ def vec_td_lambda_advantage_estimate( terminated (Tensor): boolean flag for the end of episode. Defaults to ``done`` if not provided. rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma - if a gamma tensor is tied to a single event: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, - v2 + g2 v3 + g2 g3 v4, - v3 + g3 v4, - v4, - ] - if False, it is assumed that each gamma is tied to the upcoming + of a gamma tensor is tied to a single event: + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, + ... v2 + g2 v3 + g2 g3 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + if ``False``, it is assumed that each gamma is tied to the upcoming trajectory: - gamma = [g1, g2, g3, g4] - value = [v1, v2, v3, v4] - return = [ - v1 + g1 v2 + g1**2 v3 + g**3 v4, - v2 + g2 v3 + g2**2 v4, - v3 + g3 v4, - v4, - ] - Default is True. + + >>> gamma = [g1, g2, g3, g4] + >>> value = [v1, v2, v3, v4] + >>> return = [ + ... v1 + g1 v2 + g1**2 v3 + g**3 v4, + ... v2 + g2 v3 + g2**2 v4, + ... v3 + g3 v4, + ... v4, + ... ] + + Default is ``True``. time_dim (int): dimension where the time is unrolled. Defaults to -2. 
All tensors (values, reward and done) must have shape @@ -1338,7 +1370,7 @@ def reward2go( received at each time step over multiple trajectories. done (Tensor): boolean flag for end of episode. Differs from truncated, where the episode did not end but was interrupted. - gamma (float, optional): The discount factor to use for computing the + gamma (:obj:`float`, optional): The discount factor to use for computing the discounted cumulative sum of rewards. Defaults to 1.0. time_dim (int): dimension where the time is unrolled. Defaults to -2. diff --git a/torchrl/record/loggers/csv.py b/torchrl/record/loggers/csv.py index 4de3de2b928..a2295bc116a 100644 --- a/torchrl/record/loggers/csv.py +++ b/torchrl/record/loggers/csv.py @@ -170,7 +170,7 @@ def log_scalar(self, name: str, value: float, step: int = None) -> None: Args: name (str): The name of the scalar. - value (float): The value of the scalar. + value (:obj:`float`): The value of the scalar. step (int, optional): The step at which the scalar is logged. Defaults to None. """ self.experiment.add_scalar(name, value, global_step=step) diff --git a/torchrl/record/loggers/mlflow.py b/torchrl/record/loggers/mlflow.py index 5e6256a6d44..304b9b3dbe0 100644 --- a/torchrl/record/loggers/mlflow.py +++ b/torchrl/record/loggers/mlflow.py @@ -68,7 +68,7 @@ def log_scalar(self, name: str, value: float, step: Optional[int] = None) -> Non Args: name (str): The name of the scalar. - value (float): The value of the scalar. + value (:obj:`float`): The value of the scalar. step (int, optional): The step at which the scalar is logged. Defaults to None. """ diff --git a/torchrl/record/loggers/tensorboard.py b/torchrl/record/loggers/tensorboard.py index 7327e4aff33..5ecc9742614 100644 --- a/torchrl/record/loggers/tensorboard.py +++ b/torchrl/record/loggers/tensorboard.py @@ -54,7 +54,7 @@ def log_scalar(self, name: str, value: float, step: int = None) -> None: Args: name (str): The name of the scalar. 
- value (float): The value of the scalar. + value (:obj:`float`): The value of the scalar. step (int, optional): The step at which the scalar is logged. Defaults to None. """ @@ -122,7 +122,7 @@ def log_histogram(self, name: str, data: Sequence, **kwargs): Keyword Args: step (int): Global step value to record bins (str): One of {‘tensorflow’,’auto’, ‘fd’, …}. This determines how the bins are made. You can find other options in: https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram.html - walltime (float): Optional override default walltime (time.time()) seconds after epoch of event + walltime (:obj:`float`): Optional override default walltime (time.time()) seconds after epoch of event """ global_step = kwargs.pop("step", None) diff --git a/torchrl/record/loggers/wandb.py b/torchrl/record/loggers/wandb.py index 441360a48bb..f0048648f86 100644 --- a/torchrl/record/loggers/wandb.py +++ b/torchrl/record/loggers/wandb.py @@ -110,7 +110,7 @@ def log_scalar(self, name: str, value: float, step: Optional[int] = None) -> Non Args: name (str): The name of the scalar. - value (float): The value of the scalar. + value (:obj:`float`): The value of the scalar. step (int, optional): The step at which the scalar is logged. Defaults to None. """ diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index 65247849639..7e28da45f52 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -874,11 +874,11 @@ class RewardNormalizer(TrainerHookBase): """Reward normalizer hook. Args: - decay (float, optional): exponential moving average decay parameter. + decay (:obj:`float`, optional): exponential moving average decay parameter. Default is 0.999 - scale (float, optional): the scale used to multiply the reward once + scale (:obj:`float`, optional): the scale used to multiply the reward once normalized. Defaults to 1.0. 
- eps (float, optional): the epsilon jitter used to prevent numerical + eps (:obj:`float`, optional): the epsilon jitter used to prevent numerical underflow. Defaults to ``torch.finfo(DEFAULT_DTYPE).eps`` where ``DEFAULT_DTYPE=torch.get_default_dtype()``. reward_key (str or tuple, optional): the key where to find the reward diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 13721b715e3..906d162f181 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -117,6 +117,8 @@ # method will receive a TensorDict as input that contains all the necessary # information to return a loss value. # +# .. code-block:: python +# # >>> data = replay_buffer.sample() # >>> loss_dict = loss_module(data) # @@ -130,6 +132,8 @@ # optimizer for different sets of parameters for instance. Summing the losses # can be simply done via # +# .. code-block:: python +# # >>> loss_val = sum(loss for key, loss in loss_dict.items() if key.startswith("loss_")) # # The ``__init__`` method diff --git a/tutorials/sphinx-tutorials/getting-started-2.py b/tutorials/sphinx-tutorials/getting-started-2.py index 84fefc8197a..d1917d88129 100644 --- a/tutorials/sphinx-tutorials/getting-started-2.py +++ b/tutorials/sphinx-tutorials/getting-started-2.py @@ -25,12 +25,14 @@ # # The typical training loop therefore looks like this: # -# >>> for i in range(n_collections): -# ... data = get_next_batch(env, policy) -# ... for j in range(n_optim): -# ... loss = loss_fn(data) -# ... loss.backward() -# ... optim.step() +# .. code-block:: python +# +# >>> for i in range(n_collections): +# ... data = get_next_batch(env, policy) +# ... for j in range(n_optim): +# ... loss = loss_fn(data) +# ... loss.backward() +# ... optim.step() # # In this concise tutorial, you will receive a brief overview of the loss modules. Due to the typically # straightforward nature of the API for basic usage, this tutorial will be kept brief.
diff --git a/tutorials/sphinx-tutorials/getting-started-3.py b/tutorials/sphinx-tutorials/getting-started-3.py index 3bd5c6ea5c3..594cb7392c0 100644 --- a/tutorials/sphinx-tutorials/getting-started-3.py +++ b/tutorials/sphinx-tutorials/getting-started-3.py @@ -98,8 +98,10 @@ # the environment (the ``total_frames`` argument in the collector). # For this reason, most training loops in our examples look like this: # -# >>> for data in collector: -# ... # your algorithm here +# .. code-block:: python +# +# >>> for data in collector: +# ... # your algorithm here # # # Replay Buffers @@ -112,6 +114,8 @@ # temporarily and cleared after a little while given some heuristic: # first-in first-out or other. A typical pseudo-code would look like this: # +# .. code-block:: python +# # >>> for data in collector: # ... storage.store(data) # ... for i in range(n_optim): diff --git a/tutorials/sphinx-tutorials/rb_tutorial.py b/tutorials/sphinx-tutorials/rb_tutorial.py index f189888b804..4f5ecb4936d 100644 --- a/tutorials/sphinx-tutorials/rb_tutorial.py +++ b/tutorials/sphinx-tutorials/rb_tutorial.py @@ -566,6 +566,8 @@ def assert0(x): # tensordict passed to the loss module, making it possible to update the # weights without effort: # +# .. code-block:: python +# # >>> data = replay_buffer.sample() # >>> loss_val = loss_module(data) # >>> replay_buffer.update_tensordict_priority(data) @@ -817,7 +819,7 @@ def assert0(x): # compatible with tensordict-structured data): the number of slices or their # length and some information about where the separation between the # episodes can be found (e.g. :ref:`recall that ` with a -# :ref:`DataCollector `, the trajectory id is stored in +# :ref:`DataCollector `, the trajectory id is stored in # ``("collector", "traj_ids")``). In this simple example, we construct a data # with 4 consecutive short trajectories and sample 4 slices out of it, each of # length 2 (since the batch size is 8, and 8 items // 4 slices = 2 time steps).
diff --git a/tutorials/sphinx-tutorials/torchrl_demo.py b/tutorials/sphinx-tutorials/torchrl_demo.py index e9ddbdf9048..548fffbc6a5 100644 --- a/tutorials/sphinx-tutorials/torchrl_demo.py +++ b/tutorials/sphinx-tutorials/torchrl_demo.py @@ -757,8 +757,8 @@ def exec_sequence(params, data): # We use a :class:`~torchrl.envs.SerialEnv` for simplicity (single worker), but for larger jobs a # :class:`~torchrl.envs.ParallelEnv` (multi-workers) would be better suited. # -# .. note:: -# Multiprocessed envs and multiprocessed collectors can be combined! +# .. note:: Multiprocessed envs and multiprocessed collectors can be combined! +# parallel_env = SerialEnv( 3, @@ -771,7 +771,7 @@ def exec_sequence(params, data): ############################################################################### # Sync multiprocessed data collector -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # devices = ["cpu", "cpu"]