From 3e586b78535fca66be73b6ab380aceb190257161 Mon Sep 17 00:00:00 2001
From: Zuxin
Date: Wed, 14 Jun 2023 20:30:22 -0700
Subject: [PATCH] Update configs, scripts, and instructions (#2)

* update configs
* Update setup.py
* update configs
* fix bugs
* update task
* clean setup
* update train and eval scripts
* clean setup
* update README
* update training scripts
* update configs
* remove bc-frontier
* format
* format
* add fsrl dependency
* Update README.md
* Update setup.py
* update cdt
* add bc frontier
* clean up

---------

Co-authored-by: Ja4822 <3471606159@qq.com>
---
 README.md                             |  78 ++++++--
 examples/configs/bc_configs.py        |  49 ++++-
 examples/configs/bcql_configs.py      |  48 ++++-
 examples/configs/bearl_configs.py     |  48 ++++-
 examples/configs/cdt_configs.py       | 271 ++++++++++++++++----------
 examples/configs/coptidice_configs.py |  50 ++++-
 examples/configs/cpq_configs.py       |  46 ++++-
 examples/eval/eval_bc.py              |   8 +-
 examples/eval/eval_bcql.py            |  10 +-
 examples/eval/eval_bearl.py           |  10 +-
 examples/eval/eval_cdt.py             |  10 +-
 examples/eval/eval_coptidice.py       |  10 +-
 examples/eval/eval_cpq.py             |  10 +-
 examples/train/train_bc.py            |  57 +++---
 examples/train/train_bcql.py          |  37 ++--
 examples/train/train_bearl.py         |  37 ++--
 examples/train/train_cdt.py           |  39 +++-
 examples/train/train_coptidice.py     |  38 ++--
 examples/train/train_cpq.py           |  37 ++--
 examples/train_all_tasks.py           |  55 +++++-
 osrl/__init__.py                      |   3 +-
 osrl/algorithms/bc.py                 |  11 +-
 osrl/algorithms/bcql.py               |  16 +-
 osrl/algorithms/bearl.py              |  12 +-
 osrl/algorithms/cdt.py                |  14 +-
 osrl/algorithms/coptidice.py          |  13 +-
 osrl/algorithms/cpq.py                |  14 +-
 osrl/common/__init__.py               |   5 +-
 osrl/common/dataset.py                | 210 +++++++++++++-------
 osrl/common/exp_util.py               | 151 ++++++++++++++
 osrl/common/net.py                    |  83 +++++---
 setup.py                              |  53 +++--
 32 files changed, 1056 insertions(+), 477 deletions(-)
 create mode 100644 osrl/common/exp_util.py

diff --git a/README.md b/README.md
index 03f4b93..0913e30 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
- +

@@ -7,16 +7,15 @@
 ![Python 3.8+](https://img.shields.io/badge/Python-3.8%2B-brightgreen.svg)
-[![License](https://img.shields.io/badge/License-MIT-yellow.svg)](#license)
+[![License](https://img.shields.io/badge/License-MIT-blue.svg)](#license)
+[![PyPI](https://img.shields.io/pypi/v/osrl-lib?logo=pypi)](https://pypi.org/project/osrl-lib)
+[![GitHub Repo Stars](https://img.shields.io/github/stars/liuzuxin/osrl?color=brightgreen&logo=github)](https://github.com/liuzuxin/osrl/stargazers)
+[![Downloads](https://static.pepy.tech/personalized-badge/osrl-lib?period=total&left_color=grey&right_color=blue&left_text=downloads)](https://pepy.tech/project/osrl-lib)
-
-
-
@@ -24,36 +23,79 @@

 **OSRL (Offline Safe Reinforcement Learning)** offers a collection of elegant and extensible implementations of state-of-the-art offline safe reinforcement learning (RL) algorithms. Aimed at propelling research in offline safe RL, OSRL serves as a solid foundation to implement, benchmark, and iterate on safe RL solutions.

-The OSRL package is a crucial component of our larger benchmarking suite for offline safe learning, which also includes [FSRL](https://github.com/liuzuxin/fsrl) and [DSRL](https://github.com/liuzuxin/dsrl), and is built to facilitate the development of robust and reliable offline safe RL solutions.
+The OSRL package is a crucial component of our larger benchmarking suite for offline safe learning, which also includes [DSRL](https://github.com/liuzuxin/DSRL) and [FSRL](https://github.com/liuzuxin/FSRL), and is built to facilitate the development of robust and reliable offline safe RL solutions.

 To learn more, please visit our [project website](http://www.offline-saferl.org).

 ## Structure
 The structure of this repo is as follows:
 ```
-├── osrl  # offline safe RL algorithms
-│   ├── common_net.py
-│   ├── common_util.py
-│   ├── xx_algorithm.py
-│   ├── xx_algorithm_util.py
-│   ├── ...
+├── examples
+│   ├── configs  # the training configs of each algorithm
+│   ├── eval  # the evaluation scripts
+│   ├── train  # the training scripts
+├── osrl
+│   ├── algorithms  # offline safe RL algorithms
+│   ├── common  # base networks and utils
 ```
+
+The implemented offline safe RL and imitation learning algorithms include:
+
+| Algorithm           | Type              | Description              |
+|:-------------------:|:-----------------:|:------------------------:|
+| BCQ-Lag             | Q-learning        | [BCQ](https://arxiv.org/pdf/1812.02900.pdf) with [PID Lagrangian](https://arxiv.org/abs/2007.03964) |
+| BEAR-Lag            | Q-learning        | [BEARL](https://arxiv.org/abs/1906.00949) with [PID Lagrangian](https://arxiv.org/abs/2007.03964) |
+| CPQ                 | Q-learning        | [Constraints Penalized Q-learning (CPQ)](https://arxiv.org/abs/2107.09003) |
+| COptiDICE           | Distribution Correction Estimation | [Offline Constrained Policy Optimization via stationary DIstribution Correction Estimation](https://arxiv.org/abs/2204.08957) |
+| CDT                 | Sequential Modeling | [Constrained Decision Transformer](https://arxiv.org/abs/2302.07351) |
+| BC-All              | Imitation Learning | [Behavior Cloning](https://arxiv.org/abs/2302.07351) with all datasets |
+| BC-Safe             | Imitation Learning | [Behavior Cloning](https://arxiv.org/abs/2302.07351) with safe trajectories |
+| BC-Frontier         | Imitation Learning | [Behavior Cloning](https://arxiv.org/abs/2302.07351) with high-reward trajectories |
+
 ## Installation
-Pull the repo and install:
+
+OSRL is currently hosted on [PyPI](https://pypi.org/project/osrl-lib); you can simply install it with:
+
+```bash
+pip install osrl-lib
 ```
-git clone https://github.com/liuzuxin/osrl.git
+
+You can also pull the repo and install:
+```bash
+git clone https://github.com/liuzuxin/OSRL.git
 cd osrl
 pip install -e .
 ```
+
+If you want to use the `CDT` algorithm, please also manually install the `OApackage`:
+```bash
+pip install OApackage==2.7.6
+```
+
 ## How to use OSRL
-The example usage are in the `examples` folder, where you can find the training and evaluation scripts for all the algorithms.
+The example usages are in the `examples` folder, where you can find the training and evaluation scripts for all the algorithms.
+All the parameters and their default configs for each algorithm are available in the `examples/configs` folder.
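+
+As a minimal sketch of how these configs are organized (abbreviated from `examples/configs/bc_configs.py`; only a few representative fields are shown here, see the file for the full list), each task-specific config simply subclasses the algorithm's base config and overrides a handful of fields, and Pyrallis then exposes every field as a command-line flag:
+
+```python
+from dataclasses import dataclass
+
+
+@dataclass
+class BCTrainConfig:
+    # base defaults shared by all BC tasks (abbreviated)
+    task: str = "OfflineCarCircle-v0"
+    episode_len: int = 300
+    batch_size: int = 512
+    update_steps: int = 100_000
+
+
+@dataclass
+class BCBallCircleConfig(BCTrainConfig):
+    # training params: only the task name and horizon change
+    task: str = "OfflineBallCircle-v0"
+    episode_len: int = 200
+```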
+
+OSRL uses the `WandbLogger` in [FSRL](https://github.com/liuzuxin/FSRL) and the [Pyrallis](https://github.com/eladrich/pyrallis) configuration system. The offline datasets and environments are provided in [DSRL](https://github.com/liuzuxin/DSRL), so make sure you install both of them first.
+
+### Training
 For example, to train the `bcql` method, simply run the script, overriding the default parameters:
 ```shell
-python examples/train/train_bcql.py --param1 args1
+python examples/train/train_bcql.py --task OfflineCarCircle-v0 --param1 args1 ...
 ```
-All the parameters and their default configs for each algorithm are available in the `examples/configs` folder.
\ No newline at end of file
+By default, the config file and the logs during training will be written to the `logs/` folder, and the training plots can be viewed online using Wandb.
+
+You can also launch a sequence of experiments sequentially or in parallel via the [EasyRunner](https://github.com/liuzuxin/easy-runner) package; see `examples/train_all_tasks.py` for details.
+
+### Evaluation
+To evaluate a trained agent, for example, a BCQ agent, simply run
+```shell
+python examples/eval/eval_bcql.py --path path_to_model --eval_episodes 20
+```
+It will load the config file from `path_to_model/config.yaml` and the model file from `path_to_model/checkpoints/model.pt`, run 20 episodes, and print the average normalized reward and cost.
+
+
+## Contributing
+
+If you have any suggestions or find any bugs, please feel free to submit an issue or a pull request. We welcome contributions from the community!
\ No newline at end of file
diff --git a/examples/configs/bc_configs.py b/examples/configs/bc_configs.py
index 677ee07..fd1245a 100644
--- a/examples/configs/bc_configs.py
+++ b/examples/configs/bc_configs.py
@@ -1,12 +1,13 @@
-from typing import Any, DefaultDict, Dict, List, Optional, Tuple
 from dataclasses import asdict, dataclass
+from typing import Any, DefaultDict, Dict, List, Optional, Tuple
+
 from pyrallis import field


 @dataclass
 class BCTrainConfig:
     # wandb params
-    project: str = "OSRL-baselines-new"
+    project: str = "OSRL-baselines"
     group: str = None
     name: Optional[str] = None
     prefix: Optional[str] = "BC"
@@ -16,7 +17,7 @@ class BCTrainConfig:
     # dataset params
     outliers_percent: float = None
     noise_scale: float = None
-    inpaint_ranges: Tuple[Tuple[float, float], ...] = None
+    inpaint_ranges: Tuple[Tuple[float, float, float, float], ...]
= None epsilon: float = None density: float = 1.0 # training params @@ -29,7 +30,7 @@ class BCTrainConfig: cost_limit: int = 10 episode_len: int = 300 batch_size: int = 512 - update_steps: int = 300_000 + update_steps: int = 100_000 num_workers: int = 8 bc_mode: str = "all" # "all", "safe", "risky", "frontier", "boundary", "multi-task" # model params @@ -80,6 +81,20 @@ class BCAntCircleConfig(BCTrainConfig): episode_len: int = 500 +@dataclass +class BCBallRunConfig(BCTrainConfig): + # training params + task: str = "OfflineBallRun-v0" + episode_len: int = 100 + + +@dataclass +class BCBallCircleConfig(BCTrainConfig): + # training params + task: str = "OfflineBallCircle-v0" + episode_len: int = 200 + + @dataclass class BCCarButton1Config(BCTrainConfig): # training params @@ -191,89 +206,113 @@ class BCPointPush2Config(BCTrainConfig): task: str = "OfflinePointPush2Gymnasium-v0" episode_len: int = 1000 + @dataclass class BCAntVelocityConfig(BCTrainConfig): # training params task: str = "OfflineAntVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BCHalfCheetahVelocityConfig(BCTrainConfig): # training params task: str = "OfflineHalfCheetahVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BCHopperVelocityConfig(BCTrainConfig): # training params task: str = "OfflineHopperVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BCSwimmerVelocityConfig(BCTrainConfig): # training params task: str = "OfflineSwimmerVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BCWalker2dVelocityConfig(BCTrainConfig): # training params task: str = "OfflineWalker2dVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BCEasySparseConfig(BCTrainConfig): # training params task: str = "OfflineMetadrive-easysparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCEasyMeanConfig(BCTrainConfig): # training params task: str = "OfflineMetadrive-easymean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCEasyDenseConfig(BCTrainConfig): # training params task: str = "OfflineMetadrive-easydense-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCMediumSparseConfig(BCTrainConfig): # training params task: str = "OfflineMetadrive-mediumsparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCMediumMeanConfig(BCTrainConfig): # training params task: str = "OfflineMetadrive-mediummean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCMediumDenseConfig(BCTrainConfig): # training params task: str = "OfflineMetadrive-mediumdense-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCHardSparseConfig(BCTrainConfig): # training params task: str = "OfflineMetadrive-hardsparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCHardMeanConfig(BCTrainConfig): # training params task: str = "OfflineMetadrive-hardmean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCHardDenseConfig(BCTrainConfig): # training params task: str = "OfflineMetadrive-harddense-v0" episode_len: int = 1000 + update_steps: int = 200_000 + BC_DEFAULT_CONFIG = { # bullet_safety_gym @@ -283,6 +322,8 @@ class BCHardDenseConfig(BCTrainConfig): "OfflineDroneCircle-v0": BCDroneCircleConfig, "OfflineCarRun-v0": BCCarRunConfig, "OfflineAntCircle-v0": BCAntCircleConfig, + "OfflineBallCircle-v0": BCBallCircleConfig, + "OfflineBallRun-v0": BCBallRunConfig, # safety_gymnasium: car 
"OfflineCarButton1Gymnasium-v0": BCCarButton1Config, "OfflineCarButton2Gymnasium-v0": BCCarButton2Config, diff --git a/examples/configs/bcql_configs.py b/examples/configs/bcql_configs.py index a946415..2d65b60 100644 --- a/examples/configs/bcql_configs.py +++ b/examples/configs/bcql_configs.py @@ -1,12 +1,13 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple + from pyrallis import field @dataclass class BCQLTrainConfig: # wandb params - project: str = "OSRL-baselines-new" + project: str = "OSRL-baselines" group: str = None name: Optional[str] = None prefix: Optional[str] = "BCQL" @@ -16,7 +17,7 @@ class BCQLTrainConfig: # dataset params outliers_percent: float = None noise_scale: float = None - inpaint_ranges: Tuple[Tuple[float, float], ...] = None + inpaint_ranges: Tuple[Tuple[float, float, float, float], ...] = None epsilon: float = None density: float = 1.0 # training params @@ -36,7 +37,7 @@ class BCQLTrainConfig: cost_limit: int = 10 episode_len: int = 300 batch_size: int = 512 - update_steps: int = 300_000 + update_steps: int = 100_000 num_workers: int = 8 # model params a_hidden_sizes: List[float] = field(default=[256, 256], is_mutable=True) @@ -93,6 +94,20 @@ class BCQLAntCircleConfig(BCQLTrainConfig): episode_len: int = 500 +@dataclass +class BCQLBallRunConfig(BCQLTrainConfig): + # training params + task: str = "OfflineBallRun-v0" + episode_len: int = 100 + + +@dataclass +class BCQLBallCircleConfig(BCQLTrainConfig): + # training params + task: str = "OfflineBallCircle-v0" + episode_len: int = 200 + + @dataclass class BCQLCarButton1Config(BCQLTrainConfig): # training params @@ -204,89 +219,112 @@ class BCQLPointPush2Config(BCQLTrainConfig): task: str = "OfflinePointPush2Gymnasium-v0" episode_len: int = 1000 + @dataclass class BCQLAntVelocityConfig(BCQLTrainConfig): # training params task: str = "OfflineAntVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BCQLHalfCheetahVelocityConfig(BCQLTrainConfig): # training params task: str = "OfflineHalfCheetahVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BCQLHopperVelocityConfig(BCQLTrainConfig): # training params task: str = "OfflineHopperVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BCQLSwimmerVelocityConfig(BCQLTrainConfig): # training params task: str = "OfflineSwimmerVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BCQLWalker2dVelocityConfig(BCQLTrainConfig): # training params task: str = "OfflineWalker2dVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BCQLEasySparseConfig(BCQLTrainConfig): # training params task: str = "OfflineMetadrive-easysparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCQLEasyMeanConfig(BCQLTrainConfig): # training params task: str = "OfflineMetadrive-easymean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCQLEasyDenseConfig(BCQLTrainConfig): # training params task: str = "OfflineMetadrive-easydense-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCQLMediumSparseConfig(BCQLTrainConfig): # training params task: str = "OfflineMetadrive-mediumsparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCQLMediumMeanConfig(BCQLTrainConfig): # training params task: str = "OfflineMetadrive-mediummean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class 
BCQLMediumDenseConfig(BCQLTrainConfig): # training params task: str = "OfflineMetadrive-mediumdense-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCQLHardSparseConfig(BCQLTrainConfig): # training params task: str = "OfflineMetadrive-hardsparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCQLHardMeanConfig(BCQLTrainConfig): # training params task: str = "OfflineMetadrive-hardmean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BCQLHardDenseConfig(BCQLTrainConfig): # training params task: str = "OfflineMetadrive-harddense-v0" episode_len: int = 1000 + update_steps: int = 200_000 BCQL_DEFAULT_CONFIG = { @@ -297,6 +335,8 @@ class BCQLHardDenseConfig(BCQLTrainConfig): "OfflineDroneCircle-v0": BCQLDroneCircleConfig, "OfflineCarRun-v0": BCQLCarRunConfig, "OfflineAntCircle-v0": BCQLAntCircleConfig, + "OfflineBallCircle-v0": BCQLBallCircleConfig, + "OfflineBallRun-v0": BCQLBallRunConfig, # safety_gymnasium: car "OfflineCarButton1Gymnasium-v0": BCQLCarButton1Config, "OfflineCarButton2Gymnasium-v0": BCQLCarButton2Config, diff --git a/examples/configs/bearl_configs.py b/examples/configs/bearl_configs.py index 941d8be..997e9cf 100644 --- a/examples/configs/bearl_configs.py +++ b/examples/configs/bearl_configs.py @@ -1,12 +1,13 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple + from pyrallis import field @dataclass class BEARLTrainConfig: # wandb params - project: str = "OSRL-baselines-new" + project: str = "OSRL-baselines" group: str = None name: Optional[str] = None prefix: Optional[str] = "BEARL" @@ -16,7 +17,7 @@ class BEARLTrainConfig: # dataset params outliers_percent: float = None noise_scale: float = None - inpaint_ranges: Tuple[Tuple[float, float], ...] = None + inpaint_ranges: Tuple[Tuple[float, float, float, float], ...] 
= None epsilon: float = None density: float = 1.0 # training params @@ -48,7 +49,7 @@ class BEARLTrainConfig: target_mmd_thresh: float = 0.05 num_samples_mmd_match: int = 10 start_update_policy_step: int = 0 - kernel: str = "gaussian" # or "laplacian" + kernel: str = "gaussian" # or "laplacian" num_q: int = 2 num_qc: int = 2 PID: List[float] = field(default=[0.1, 0.003, 0.001], is_mutable=True) @@ -97,6 +98,20 @@ class BEARLAntCircleConfig(BEARLTrainConfig): episode_len: int = 500 +@dataclass +class BEARLBallRunConfig(BEARLTrainConfig): + # training params + task: str = "OfflineBallRun-v0" + episode_len: int = 100 + + +@dataclass +class BEARLBallCircleConfig(BEARLTrainConfig): + # training params + task: str = "OfflineBallCircle-v0" + episode_len: int = 200 + + @dataclass class BEARLCarButton1Config(BEARLTrainConfig): # training params @@ -208,89 +223,112 @@ class BEARLPointPush2Config(BEARLTrainConfig): task: str = "OfflinePointPush2Gymnasium-v0" episode_len: int = 1000 + @dataclass class BEARLAntVelocityConfig(BEARLTrainConfig): # training params task: str = "OfflineAntVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BEARLHalfCheetahVelocityConfig(BEARLTrainConfig): # training params task: str = "OfflineHalfCheetahVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BEARLHopperVelocityConfig(BEARLTrainConfig): # training params task: str = "OfflineHopperVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BEARLSwimmerVelocityConfig(BEARLTrainConfig): # training params task: str = "OfflineSwimmerVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BEARLWalker2dVelocityConfig(BEARLTrainConfig): # training params task: str = "OfflineWalker2dVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class BEARLEasySparseConfig(BEARLTrainConfig): # training params task: str = "OfflineMetadrive-easysparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BEARLEasyMeanConfig(BEARLTrainConfig): # training params task: str = "OfflineMetadrive-easymean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BEARLEasyDenseConfig(BEARLTrainConfig): # training params task: str = "OfflineMetadrive-easydense-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BEARLMediumSparseConfig(BEARLTrainConfig): # training params task: str = "OfflineMetadrive-mediumsparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BEARLMediumMeanConfig(BEARLTrainConfig): # training params task: str = "OfflineMetadrive-mediummean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BEARLMediumDenseConfig(BEARLTrainConfig): # training params task: str = "OfflineMetadrive-mediumdense-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BEARLHardSparseConfig(BEARLTrainConfig): # training params task: str = "OfflineMetadrive-hardsparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BEARLHardMeanConfig(BEARLTrainConfig): # training params task: str = "OfflineMetadrive-hardmean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class BEARLHardDenseConfig(BEARLTrainConfig): # training params task: str = "OfflineMetadrive-harddense-v0" episode_len: int = 1000 + update_steps: int = 200_000 BEARL_DEFAULT_CONFIG = { @@ -301,6 +339,8 @@ class BEARLHardDenseConfig(BEARLTrainConfig): "OfflineDroneCircle-v0": BEARLDroneCircleConfig, "OfflineCarRun-v0": BEARLCarRunConfig, "OfflineAntCircle-v0": 
BEARLAntCircleConfig, + "OfflineBallCircle-v0": BEARLBallCircleConfig, + "OfflineBallRun-v0": BEARLBallRunConfig, # safety_gymnasium "OfflineCarButton1Gymnasium-v0": BEARLCarButton1Config, "OfflineCarButton2Gymnasium-v0": BEARLCarButton2Config, diff --git a/examples/configs/cdt_configs.py b/examples/configs/cdt_configs.py index fc36140..0f4edff 100644 --- a/examples/configs/cdt_configs.py +++ b/examples/configs/cdt_configs.py @@ -1,11 +1,11 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple @dataclass class CDTTrainConfig: # wandb params - project: str = "OSRL-baselines-new" + project: str = "OSRL-baselines" group: str = None name: Optional[str] = None prefix: Optional[str] = "CDT" @@ -43,9 +43,8 @@ class CDTTrainConfig: cost_scale: float = 1 num_workers: int = 8 # evaluation params - target_returns: Tuple[Tuple[float, ...], ...] = ((450.0, 10), - (500.0, 20), - (550.0, 50)) # reward, cost + target_returns: Tuple[Tuple[float, ...], + ...] = ((450.0, 10), (500.0, 20), (550.0, 50)) # reward, cost cost_limit: int = 10 eval_episodes: int = 10 eval_every: int = 2500 @@ -62,7 +61,7 @@ class CDTTrainConfig: max_reward: float = 600.0 # minimum reward above the PF curve min_reward: float = 1.0 - # the max drecrease of ret between the associated traj + # the max drecrease of ret between the associated traj # w.r.t the nearest pf traj max_rew_decrease: float = 100.0 # model mode params @@ -111,9 +110,8 @@ class CDTAntRunConfig(CDTTrainConfig): episode_len: int = 200 # training params task: str = "OfflineAntRun-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((700.0, 10), - (750.0, 30), - (800.0, 70)) + target_returns: Tuple[Tuple[float, ...], + ...] = ((700.0, 10), (750.0, 20), (800.0, 40)) # augmentation param deg: int = 3 max_reward: float = 1000.0 @@ -128,9 +126,8 @@ class CDTDroneRunConfig(CDTTrainConfig): episode_len: int = 200 # training params task: str = "OfflineDroneRun-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((400.0, 10), - (500.0, 30), - (600.0, 70)) + target_returns: Tuple[Tuple[float, ...], + ...] = ((400.0, 10), (500.0, 20), (600.0, 40)) # augmentation param deg: int = 1 max_reward: float = 700.0 @@ -146,9 +143,8 @@ class CDTDroneCircleConfig(CDTTrainConfig): episode_len: int = 300 # training params task: str = "OfflineDroneCircle-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((700.0, 10), - (750.0, 20), - (800.0, 50)) + target_returns: Tuple[Tuple[float, ...], + ...] = ((700.0, 10), (750.0, 20), (800.0, 40)) # augmentation param deg: int = 1 max_reward: float = 1000.0 @@ -164,15 +160,14 @@ class CDTCarRunConfig(CDTTrainConfig): episode_len: int = 200 # training params task: str = "OfflineCarRun-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((300.0, 0), (300, 10), - (300, 20), (400, 20), - (500, 20), (300, 40), - (400, 40), (500, 40), - (500, 60)) + target_returns: Tuple[Tuple[float, ...], + ...] = ((575.0, 10), (575.0, 20), (575.0, 40)) # augmentation param + deg: int = 0 max_reward: float = 600.0 max_rew_decrease: float = 100 min_reward: float = 1 + device: str = "cuda:3" @dataclass @@ -182,9 +177,8 @@ class CDTAntCircleConfig(CDTTrainConfig): episode_len: int = 500 # training params task: str = "OfflineAntCircle-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((300.0, 10), - (350.0, 50), - (400.0, 100)) + target_returns: Tuple[Tuple[float, ...], + ...] 
= ((300.0, 10), (350.0, 20), (400.0, 40)) # augmentation param deg: int = 2 max_reward: float = 500.0 @@ -194,21 +188,37 @@ class CDTAntCircleConfig(CDTTrainConfig): @dataclass -class CDTCarReachConfig(CDTTrainConfig): +class CDTBallRunConfig(CDTTrainConfig): + # model params + seq_len: int = 10 + episode_len: int = 100 + # training params + task: str = "OfflineBallRun-v0" + target_returns: Tuple[Tuple[float, ...], + ...] = ((500.0, 10), (500.0, 20), (700.0, 40)) + # augmentation param + deg: int = 2 + max_reward: float = 1400.0 + max_rew_decrease: float = 200 + min_reward: float = 1 + device: str = "cuda:2" + + +@dataclass +class CDTBallCircleConfig(CDTTrainConfig): # model params seq_len: int = 10 episode_len: int = 200 # training params - task: str = "OfflineCarReach-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((300.0, 0), (300, 10), - (300, 20), (400, 20), - (500, 20), (300, 40), - (400, 40), (500, 40), - (500, 60)) + task: str = "OfflineBallCircle-v0" + target_returns: Tuple[Tuple[float, ...], + ...] = ((700.0, 10), (750.0, 20), (800.0, 40)) # augmentation param - max_reward: float = 300.0 + deg: int = 2 + max_reward: float = 1000.0 max_rew_decrease: float = 200 min_reward: float = 1 + device: str = "cuda:1" @dataclass @@ -218,9 +228,7 @@ class CDTCarButton1Config(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflineCarButton1Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((35.0, 40), - (35.0, 80), - (35.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] = ((35.0, 20), (35.0, 40), (35.0, 80)) # augmentation param deg: int = 0 max_reward: float = 45.0 @@ -236,9 +244,7 @@ class CDTCarButton2Config(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflineCarButton2Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((40.0, 40), - (40.0, 80), - (40.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] = ((40.0, 20), (40.0, 40), (40.0, 80)) # augmentation param deg: int = 0 max_reward: float = 50.0 @@ -254,9 +260,7 @@ class CDTCarCircle1Config(CDTTrainConfig): episode_len: int = 500 # training params task: str = "OfflineCarCircle1Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((20.0, 40), - (22.5, 80), - (25.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] = ((20.0, 20), (22.5, 40), (25.0, 80)) # augmentation param deg: int = 1 max_reward: float = 30.0 @@ -272,9 +276,7 @@ class CDTCarCircle2Config(CDTTrainConfig): episode_len: int = 500 # training params task: str = "OfflineCarCircle2Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((20.0, 40), - (21.0, 80), - (22.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] = ((20.0, 20), (21.0, 40), (22.0, 80)) # augmentation param deg: int = 1 max_reward: float = 30.0 @@ -290,9 +292,7 @@ class CDTCarGoal1Config(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflineCarGoal1Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((40.0, 20), - (40.0, 40), - (40.0, 80)) + target_returns: Tuple[Tuple[float, ...], ...] = ((40.0, 20), (40.0, 40), (40.0, 80)) # augmentation param deg: int = 1 max_reward: float = 50.0 @@ -308,9 +308,7 @@ class CDTCarGoal2Config(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflineCarGoal2Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((30.0, 40), - (30.0, 80), - (30.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] 
= ((30.0, 20), (30.0, 40), (30.0, 80)) # augmentation param deg: int = 1 max_reward: float = 35.0 @@ -326,9 +324,7 @@ class CDTCarPush1Config(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflineCarPush1Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((15.0, 40), - (15.0, 80), - (15.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] = ((15.0, 20), (15.0, 40), (15.0, 80)) # augmentation param deg: int = 0 max_reward: float = 20.0 @@ -344,9 +340,7 @@ class CDTCarPush2Config(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflineCarPush2Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((12.0, 40), - (12.0, 80), - (12.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] = ((12.0, 20), (12.0, 40), (12.0, 80)) # augmentation param deg: int = 0 max_reward: float = 15.0 @@ -362,9 +356,7 @@ class CDTPointButton1Config(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflinePointButton1Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((40.0, 40), - (40.0, 80), - (40.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] = ((40.0, 20), (40.0, 40), (40.0, 80)) # augmentation param deg: int = 0 max_reward: float = 45.0 @@ -380,9 +372,7 @@ class CDTPointButton2Config(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflinePointButton2Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((40.0, 40), - (40.0, 80), - (40.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] = ((40.0, 20), (40.0, 40), (40.0, 80)) # augmentation param deg: int = 0 max_reward: float = 50.0 @@ -398,9 +388,7 @@ class CDTPointCircle1Config(CDTTrainConfig): episode_len: int = 500 # training params task: str = "OfflinePointCircle1Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((50.0, 40), - (52.5, 80), - (55.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] = ((50.0, 20), (52.5, 40), (55.0, 80)) # augmentation param deg: int = 1 max_reward: float = 65.0 @@ -416,9 +404,7 @@ class CDTPointCircle2Config(CDTTrainConfig): episode_len: int = 500 # training params task: str = "OfflinePointCircle2Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((45.0, 40), - (47.5, 80), - (50.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] = ((45.0, 20), (47.5, 40), (50.0, 80)) # augmentation param deg: int = 1 max_reward: float = 55.0 @@ -434,9 +420,7 @@ class CDTPointGoal1Config(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflinePointGoal1Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((30.0, 20), - (30.0, 40), - (30.0, 80)) + target_returns: Tuple[Tuple[float, ...], ...] = ((30.0, 20), (30.0, 40), (30.0, 80)) # augmentation param deg: int = 0 max_reward: float = 35.0 @@ -452,9 +436,7 @@ class CDTPointGoal2Config(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflinePointGoal2Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((30.0, 40), - (30.0, 80), - (30.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] = ((30.0, 20), (30.0, 40), (30.0, 80)) # augmentation param deg: int = 1 max_reward: float = 35.0 @@ -470,9 +452,7 @@ class CDTPointPush1Config(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflinePointPush1Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((15.0, 40), - (15.0, 80), - (15.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] 
= ((15.0, 20), (15.0, 40), (15.0, 80)) # augmentation param deg: int = 0 max_reward: float = 20.0 @@ -488,9 +468,7 @@ class CDTPointPush2Config(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflinePointPush2Gymnasium-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((12.0, 40), - (12.0, 80), - (12.0, 120)) + target_returns: Tuple[Tuple[float, ...], ...] = ((12.0, 20), (12.0, 40), (12.0, 80)) # augmentation param deg: int = 0 max_reward: float = 15.0 @@ -501,9 +479,20 @@ class CDTPointPush2Config(CDTTrainConfig): @dataclass class CDTAntVelocityConfig(CDTTrainConfig): + # model params + seq_len: int = 10 + episode_len: int = 1000 # training params task: str = "OfflineAntVelocityGymnasium-v1" - episode_len: int = 1000 + target_returns: Tuple[Tuple[float, ...], + ...] = ((2800.0, 20), (2800.0, 40), (2800.0, 80)) + # augmentation param + deg: int = 1 + max_reward: float = 3000.0 + max_rew_decrease: float = 500 + min_reward: float = 1 + device: str = "cuda:1" + @dataclass class CDTHalfCheetahVelocityConfig(CDTTrainConfig): @@ -512,9 +501,8 @@ class CDTHalfCheetahVelocityConfig(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflineHalfCheetahVelocityGymnasium-v1" - target_returns: Tuple[Tuple[float, ...], ...] = ((3000.0, 20), - (3000.0, 40), - (3000.0, 80)) + target_returns: Tuple[Tuple[float, ...], + ...] = ((3000.0, 20), (3000.0, 40), (3000.0, 80)) # augmentation param deg: int = 1 max_reward: float = 3000.0 @@ -522,11 +510,23 @@ class CDTHalfCheetahVelocityConfig(CDTTrainConfig): min_reward: float = 1 device: str = "cuda:2" + @dataclass class CDTHopperVelocityConfig(CDTTrainConfig): + # model params + seq_len: int = 10 + episode_len: int = 1000 # training params task: str = "OfflineHopperVelocityGymnasium-v1" - episode_len: int = 1000 + target_returns: Tuple[Tuple[float, ...], + ...] = ((1750.0, 20), (1750.0, 40), (1750.0, 80)) + # augmentation param + deg: int = 1 + max_reward: float = 2000.0 + max_rew_decrease: float = 300 + min_reward: float = 1 + device: str = "cuda:2" + @dataclass class CDTSwimmerVelocityConfig(CDTTrainConfig): @@ -535,9 +535,8 @@ class CDTSwimmerVelocityConfig(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflineSwimmerVelocityGymnasium-v1" - target_returns: Tuple[Tuple[float, ...], ...] = ((160.0, 20), - (160.0, 40), - (160.0, 80)) + target_returns: Tuple[Tuple[float, ...], + ...] = ((160.0, 20), (160.0, 40), (160.0, 80)) # augmentation param deg: int = 1 max_reward: float = 250.0 @@ -545,6 +544,7 @@ class CDTSwimmerVelocityConfig(CDTTrainConfig): min_reward: float = 1 device: str = "cuda:2" + @dataclass class CDTWalker2dVelocityConfig(CDTTrainConfig): # model params @@ -552,9 +552,8 @@ class CDTWalker2dVelocityConfig(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflineWalker2dVelocityGymnasium-v1" - target_returns: Tuple[Tuple[float, ...], ...] = ((2800.0, 20), - (2800.0, 40), - (2800.0, 80)) + target_returns: Tuple[Tuple[float, ...], + ...] = ((2800.0, 20), (2800.0, 40), (2800.0, 80)) # augmentation param deg: int = 1 max_reward: float = 3600.0 @@ -565,16 +564,38 @@ class CDTWalker2dVelocityConfig(CDTTrainConfig): @dataclass class CDTEasySparseConfig(CDTTrainConfig): + # model params + seq_len: int = 10 + episode_len: int = 1000 # training params task: str = "OfflineMetadrive-easysparse-v0" - episode_len: int = 1000 + update_steps: int = 200_000 + target_returns: Tuple[Tuple[float, ...], + ...] 
= ((300.0, 10), (350.0, 20), (400.0, 40)) + # augmentation param + deg: int = 2 + max_reward: float = 500.0 + max_rew_decrease: float = 100 + min_reward: float = 1 + device: str = "cuda:3" @dataclass class CDTEasyMeanConfig(CDTTrainConfig): + # model params + seq_len: int = 10 + episode_len: int = 1000 # training params task: str = "OfflineMetadrive-easymean-v0" - episode_len: int = 1000 + update_steps: int = 200_000 + target_returns: Tuple[Tuple[float, ...], + ...] = ((300.0, 10), (350.0, 20), (400.0, 40)) + # augmentation param + deg: int = 2 + max_reward: float = 500.0 + max_rew_decrease: float = 100 + min_reward: float = 1 + device: str = "cuda:3" @dataclass @@ -584,9 +605,9 @@ class CDTEasyDenseConfig(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflineMetadrive-easydense-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((300.0, 10), - (350.0, 20), - (400.0, 40)) + update_steps: int = 200_000 + target_returns: Tuple[Tuple[float, ...], + ...] = ((300.0, 10), (350.0, 20), (400.0, 40)) # augmentation param deg: int = 2 max_reward: float = 500.0 @@ -597,9 +618,20 @@ class CDTEasyDenseConfig(CDTTrainConfig): @dataclass class CDTMediumSparseConfig(CDTTrainConfig): + # model params + seq_len: int = 10 + episode_len: int = 1000 # training params task: str = "OfflineMetadrive-mediumsparse-v0" - episode_len: int = 1000 + update_steps: int = 200_000 + target_returns: Tuple[Tuple[float, ...], + ...] = ((300.0, 10), (300.0, 20), (300.0, 40)) + # augmentation param + deg: int = 0 + max_reward: float = 300.0 + max_rew_decrease: float = 100 + min_reward: float = 1 + device: str = "cuda:3" @dataclass @@ -609,9 +641,9 @@ class CDTMediumMeanConfig(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflineMetadrive-mediummean-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((300.0, 10), - (300.0, 20), - (300.0, 40)) + update_steps: int = 200_000 + target_returns: Tuple[Tuple[float, ...], + ...] = ((300.0, 10), (300.0, 20), (300.0, 40)) # augmentation param deg: int = 0 max_reward: float = 300.0 @@ -625,6 +657,7 @@ class CDTMediumDenseConfig(CDTTrainConfig): # training params task: str = "OfflineMetadrive-mediumdense-v0" episode_len: int = 1000 + update_steps: int = 200_000 @dataclass @@ -634,11 +667,11 @@ class CDTHardSparseConfig(CDTTrainConfig): episode_len: int = 1000 # training params task: str = "OfflineMetadrive-hardsparse-v0" - target_returns: Tuple[Tuple[float, ...], ...] = ((300.0, 10), - (350.0, 20), - (400.0, 40)) + update_steps: int = 200_000 + target_returns: Tuple[Tuple[float, ...], + ...] = ((300.0, 10), (350.0, 20), (400.0, 40)) # augmentation param - deg: int = 2 + deg: int = 1 max_reward: float = 500.0 max_rew_decrease: float = 100 min_reward: float = 1 @@ -647,18 +680,40 @@ class CDTHardSparseConfig(CDTTrainConfig): @dataclass class CDTHardMeanConfig(CDTTrainConfig): + # model params + seq_len: int = 10 + episode_len: int = 1000 # training params task: str = "OfflineMetadrive-hardmean-v0" - episode_len: int = 1000 + update_steps: int = 200_000 + target_returns: Tuple[Tuple[float, ...], + ...] 
= ((300.0, 10), (350.0, 20), (400.0, 40)) + # augmentation param + deg: int = 1 + max_reward: float = 500.0 + max_rew_decrease: float = 100 + min_reward: float = 1 + device: str = "cuda:2" @dataclass class CDTHardDenseConfig(CDTTrainConfig): + # model params + seq_len: int = 10 + episode_len: int = 1000 # training params task: str = "OfflineMetadrive-harddense-v0" - episode_len: int = 1000 + update_steps: int = 200_000 + target_returns: Tuple[Tuple[float, ...], + ...] = ((300.0, 10), (350.0, 20), (400.0, 40)) + # augmentation param + deg: int = 1 + max_reward: float = 500.0 + max_rew_decrease: float = 100 + min_reward: float = 1 + device: str = "cuda:2" + - CDT_DEFAULT_CONFIG = { # bullet_safety_gym "OfflineCarCircle-v0": CDTCarCircleConfig, @@ -667,6 +722,8 @@ class CDTHardDenseConfig(CDTTrainConfig): "OfflineDroneCircle-v0": CDTDroneCircleConfig, "OfflineCarRun-v0": CDTCarRunConfig, "OfflineAntCircle-v0": CDTAntCircleConfig, + "OfflineBallCircle-v0": CDTBallCircleConfig, + "OfflineBallRun-v0": CDTBallRunConfig, # safety_gymnasium "OfflineCarButton1Gymnasium-v0": CDTCarButton1Config, "OfflineCarButton2Gymnasium-v0": CDTCarButton2Config, @@ -701,4 +758,4 @@ class CDTHardDenseConfig(CDTTrainConfig): "OfflineMetadrive-hardsparse-v0": CDTHardSparseConfig, "OfflineMetadrive-hardmean-v0": CDTHardMeanConfig, "OfflineMetadrive-harddense-v0": CDTHardDenseConfig -} \ No newline at end of file +} diff --git a/examples/configs/coptidice_configs.py b/examples/configs/coptidice_configs.py index 857b78e..7ebd94c 100644 --- a/examples/configs/coptidice_configs.py +++ b/examples/configs/coptidice_configs.py @@ -1,12 +1,13 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple + from pyrallis import field @dataclass class COptiDICETrainConfig: # wandb params - project: str = "OSRL-baselines-new" + project: str = "OSRL-baselines" group: str = None name: Optional[str] = None prefix: Optional[str] = "COptiDICE" @@ -16,7 +17,7 @@ class COptiDICETrainConfig: # dataset params outliers_percent: float = None noise_scale: float = None - inpaint_ranges: Tuple[Tuple[float, float], ...] = None + inpaint_ranges: Tuple[Tuple[float, float, float, float], ...] 
= None epsilon: float = None density: float = 1.0 # training params @@ -33,7 +34,7 @@ class COptiDICETrainConfig: cost_limit: int = 10 episode_len: int = 300 batch_size: int = 512 - update_steps: int = 300_000 + update_steps: int = 100_000 num_workers: int = 8 # model params a_hidden_sizes: List[float] = field(default=[256, 256], is_mutable=True) @@ -89,6 +90,20 @@ class COptiDICEAntCircleConfig(COptiDICETrainConfig): episode_len: int = 500 +@dataclass +class COptiDICEBallRunConfig(COptiDICETrainConfig): + # training params + task: str = "OfflineBallRun-v0" + episode_len: int = 100 + + +@dataclass +class COptiDICEBallCircleConfig(COptiDICETrainConfig): + # training params + task: str = "OfflineBallCircle-v0" + episode_len: int = 200 + + @dataclass class COptiDICECarButton1Config(COptiDICETrainConfig): # training params @@ -200,89 +215,112 @@ class COptiDICEPointPush2Config(COptiDICETrainConfig): task: str = "OfflinePointPush2Gymnasium-v0" episode_len: int = 1000 + @dataclass class COptiDICEAntVelocityConfig(COptiDICETrainConfig): # training params task: str = "OfflineAntVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class COptiDICEHalfCheetahVelocityConfig(COptiDICETrainConfig): # training params task: str = "OfflineHalfCheetahVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class COptiDICEHopperVelocityConfig(COptiDICETrainConfig): # training params task: str = "OfflineHopperVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class COptiDICESwimmerVelocityConfig(COptiDICETrainConfig): # training params task: str = "OfflineSwimmerVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class COptiDICEWalker2dVelocityConfig(COptiDICETrainConfig): # training params task: str = "OfflineWalker2dVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class COptiDICEEasySparseConfig(COptiDICETrainConfig): # training params task: str = "OfflineMetadrive-easysparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class COptiDICEEasyMeanConfig(COptiDICETrainConfig): # training params task: str = "OfflineMetadrive-easymean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class COptiDICEEasyDenseConfig(COptiDICETrainConfig): # training params task: str = "OfflineMetadrive-easydense-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class COptiDICEMediumSparseConfig(COptiDICETrainConfig): # training params task: str = "OfflineMetadrive-mediumsparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class COptiDICEMediumMeanConfig(COptiDICETrainConfig): # training params task: str = "OfflineMetadrive-mediummean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class COptiDICEMediumDenseConfig(COptiDICETrainConfig): # training params task: str = "OfflineMetadrive-mediumdense-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class COptiDICEHardSparseConfig(COptiDICETrainConfig): # training params task: str = "OfflineMetadrive-hardsparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class COptiDICEHardMeanConfig(COptiDICETrainConfig): # training params task: str = "OfflineMetadrive-hardmean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class COptiDICEHardDenseConfig(COptiDICETrainConfig): # training params task: str = "OfflineMetadrive-harddense-v0" episode_len: int = 1000 + update_steps: int = 200_000 COptiDICE_DEFAULT_CONFIG = { @@ -293,6 +331,8 @@ class 
COptiDICEHardDenseConfig(COptiDICETrainConfig): "OfflineDroneCircle-v0": COptiDICEDroneCircleConfig, "OfflineCarRun-v0": COptiDICECarRunConfig, "OfflineAntCircle-v0": COptiDICEAntCircleConfig, + "OfflineBallCircle-v0": COptiDICEBallCircleConfig, + "OfflineBallRun-v0": COptiDICEBallRunConfig, # safety_gymnasium "OfflineCarButton1Gymnasium-v0": COptiDICECarButton1Config, "OfflineCarButton2Gymnasium-v0": COptiDICECarButton2Config, @@ -327,4 +367,4 @@ class COptiDICEHardDenseConfig(COptiDICETrainConfig): "OfflineMetadrive-hardsparse-v0": COptiDICEHardSparseConfig, "OfflineMetadrive-hardmean-v0": COptiDICEHardMeanConfig, "OfflineMetadrive-harddense-v0": COptiDICEHardDenseConfig -} \ No newline at end of file +} diff --git a/examples/configs/cpq_configs.py b/examples/configs/cpq_configs.py index e7c5204..598c881 100644 --- a/examples/configs/cpq_configs.py +++ b/examples/configs/cpq_configs.py @@ -1,12 +1,13 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple + from pyrallis import field @dataclass class CPQTrainConfig: # wandb params - project: str = "OSRL-baselines-new" + project: str = "OSRL-baselines" group: str = None name: Optional[str] = None prefix: Optional[str] = "CPQ" @@ -16,7 +17,7 @@ class CPQTrainConfig: # dataset params outliers_percent: float = None noise_scale: float = None - inpaint_ranges: Tuple[Tuple[float, float], ...] = None + inpaint_ranges: Tuple[Tuple[float, float, float, float], ...] = None epsilon: float = None density: float = 1.0 # training params @@ -34,7 +35,7 @@ class CPQTrainConfig: cost_limit: int = 10 episode_len: int = 300 batch_size: int = 512 - update_steps: int = 300_000 + update_steps: int = 100_000 num_workers: int = 8 # model params a_hidden_sizes: List[float] = field(default=[256, 256], is_mutable=True) @@ -92,6 +93,19 @@ class CPQAntCircleConfig(CPQTrainConfig): episode_len: int = 500 +@dataclass +class CPQBallRunConfig(CPQTrainConfig): + # training params + task: str = "OfflineBallRun-v0" + episode_len: int = 100 + + +@dataclass +class CPQBallCircleConfig(CPQTrainConfig): + # training params + task: str = "OfflineBallCircle-v0" + episode_len: int = 200 + @dataclass class CPQCarButton1Config(CPQTrainConfig): @@ -211,83 +225,105 @@ class CPQAntVelocityConfig(CPQTrainConfig): task: str = "OfflineAntVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class CPQHalfCheetahVelocityConfig(CPQTrainConfig): # training params task: str = "OfflineHalfCheetahVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class CPQHopperVelocityConfig(CPQTrainConfig): # training params task: str = "OfflineHopperVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class CPQSwimmerVelocityConfig(CPQTrainConfig): # training params task: str = "OfflineSwimmerVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class CPQWalker2dVelocityConfig(CPQTrainConfig): # training params task: str = "OfflineWalker2dVelocityGymnasium-v1" episode_len: int = 1000 + @dataclass class CPQEasySparseConfig(CPQTrainConfig): # training params task: str = "OfflineMetadrive-easysparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class CPQEasyMeanConfig(CPQTrainConfig): # training params task: str = "OfflineMetadrive-easymean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class CPQEasyDenseConfig(CPQTrainConfig): # training params task: str = "OfflineMetadrive-easydense-v0" episode_len: int = 1000 + 
update_steps: int = 200_000 + @dataclass class CPQMediumSparseConfig(CPQTrainConfig): # training params task: str = "OfflineMetadrive-mediumsparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class CPQMediumMeanConfig(CPQTrainConfig): # training params task: str = "OfflineMetadrive-mediummean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class CPQMediumDenseConfig(CPQTrainConfig): # training params task: str = "OfflineMetadrive-mediumdense-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class CPQHardSparseConfig(CPQTrainConfig): # training params task: str = "OfflineMetadrive-hardsparse-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class CPQHardMeanConfig(CPQTrainConfig): # training params task: str = "OfflineMetadrive-hardmean-v0" episode_len: int = 1000 + update_steps: int = 200_000 + @dataclass class CPQHardDenseConfig(CPQTrainConfig): # training params task: str = "OfflineMetadrive-harddense-v0" episode_len: int = 1000 + update_steps: int = 200_000 CPQ_DEFAULT_CONFIG = { @@ -298,6 +334,8 @@ class CPQHardDenseConfig(CPQTrainConfig): "OfflineDroneCircle-v0": CPQDroneCircleConfig, "OfflineCarRun-v0": CPQCarRunConfig, "OfflineAntCircle-v0": CPQAntCircleConfig, + "OfflineBallCircle-v0": CPQBallCircleConfig, + "OfflineBallRun-v0": CPQBallRunConfig, # safety_gymnasium "OfflineCarButton1Gymnasium-v0": CPQCarButton1Config, "OfflineCarButton2Gymnasium-v0": CPQCarButton2Config, diff --git a/examples/eval/eval_bc.py b/examples/eval/eval_bc.py index e36a0f3..df04042 100644 --- a/examples/eval/eval_bc.py +++ b/examples/eval/eval_bc.py @@ -1,15 +1,15 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import gymnasium as gym # noqa import dsrl +import gymnasium as gym # noqa import numpy as np import pyrallis -from pyrallis import field import torch +from pyrallis import field from osrl.algorithms import BC, BCTrainer -from fsrl.utils.exp_util import load_config_and_model, seed_all +from osrl.common.exp_util import load_config_and_model, seed_all @dataclass diff --git a/examples/eval/eval_bcql.py b/examples/eval/eval_bcql.py index 15df384..4d96e2c 100644 --- a/examples/eval/eval_bcql.py +++ b/examples/eval/eval_bcql.py @@ -1,16 +1,16 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import gymnasium as gym # noqa import dsrl +import gymnasium as gym # noqa import numpy as np import pyrallis -from pyrallis import field import torch +from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa +from pyrallis import field from osrl.algorithms import BCQL, BCQLTrainer -from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa -from fsrl.utils.exp_util import load_config_and_model, seed_all +from osrl.common.exp_util import load_config_and_model, seed_all @dataclass diff --git a/examples/eval/eval_bearl.py b/examples/eval/eval_bearl.py index c12f9cd..147e76e 100644 --- a/examples/eval/eval_bearl.py +++ b/examples/eval/eval_bearl.py @@ -1,16 +1,16 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import gymnasium as gym # noqa import dsrl +import gymnasium as gym # noqa import numpy as np import pyrallis -from pyrallis import 
field import torch +from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa +from pyrallis import field from osrl.algorithms import BEARL, BEARLTrainer -from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa -from fsrl.utils.exp_util import load_config_and_model, seed_all +from osrl.common.exp_util import load_config_and_model, seed_all @dataclass diff --git a/examples/eval/eval_cdt.py b/examples/eval/eval_cdt.py index 1645b77..e1b9ef8 100644 --- a/examples/eval/eval_cdt.py +++ b/examples/eval/eval_cdt.py @@ -1,16 +1,16 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import gymnasium as gym # noqa import dsrl +import gymnasium as gym # noqa import numpy as np import pyrallis -from pyrallis import field import torch +from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa +from pyrallis import field from osrl.algorithms import CDT, CDTTrainer -from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa -from fsrl.utils.exp_util import load_config_and_model, seed_all +from osrl.common.exp_util import load_config_and_model, seed_all @dataclass diff --git a/examples/eval/eval_coptidice.py b/examples/eval/eval_coptidice.py index 3fce3c3..9ee8a56 100644 --- a/examples/eval/eval_coptidice.py +++ b/examples/eval/eval_coptidice.py @@ -1,16 +1,16 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import gymnasium as gym # noqa import dsrl +import gymnasium as gym # noqa import numpy as np import pyrallis -from pyrallis import field import torch +from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa +from pyrallis import field from osrl.algorithms import COptiDICE, COptiDICETrainer -from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa -from fsrl.utils.exp_util import load_config_and_model, seed_all +from osrl.common.exp_util import load_config_and_model, seed_all @dataclass diff --git a/examples/eval/eval_cpq.py b/examples/eval/eval_cpq.py index 9f83ff7..0d4b344 100644 --- a/examples/eval/eval_cpq.py +++ b/examples/eval/eval_cpq.py @@ -1,16 +1,16 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import gymnasium as gym # noqa import dsrl +import gymnasium as gym # noqa import numpy as np import pyrallis -from pyrallis import field import torch +from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa +from pyrallis import field from osrl.algorithms import CPQ, CPQTrainer -from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa -from fsrl.utils.exp_util import load_config_and_model, seed_all +from osrl.common.exp_util import load_config_and_model, seed_all @dataclass diff --git a/examples/train/train_bc.py b/examples/train/train_bc.py index 5b074a5..14f18d6 100644 --- a/examples/train/train_bc.py +++ b/examples/train/train_bc.py @@ -1,25 +1,25 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple -from dataclasses import asdict, dataclass import os import uuid +from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import gymnasium as gym # noqa import bullet_safety_gym # noqa import dsrl +import gymnasium as gym # noqa import numpy as np import pyrallis import torch +from dsrl.infos import 
DENSITY_CFG +from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa +from fsrl.utils import WandbLogger from torch.utils.data import DataLoader from tqdm.auto import trange # noqa -from dsrl.infos import DEFAULT_MAX_EPISODE_STEPS, DENSITY_CFG -from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa -from fsrl.utils import WandbLogger, DummyLogger +from examples.configs.bc_configs import BC_DEFAULT_CONFIG, BCTrainConfig +from osrl.algorithms import BC, BCTrainer from osrl.common import TransitionDataset from osrl.common.dataset import process_bc_dataset -from osrl.algorithms import BC, BCTrainer -from fsrl.utils.exp_util import auto_name, seed_all -from examples.configs.bc_configs import BCTrainConfig, BC_DEFAULT_CONFIG +from osrl.common.exp_util import auto_name, seed_all @pyrallis.wrap() @@ -29,7 +29,6 @@ def train(args: BCTrainConfig): torch.set_num_threads(args.threads) # setup logger - args.episode_len = DEFAULT_MAX_EPISODE_STEPS[args.task.split("-")[0][len("Offline"):][:-len("Gymnasium")]] cfg = asdict(args) default_cfg = asdict(BC_DEFAULT_CONFIG[args.task]()) if args.name is None: @@ -39,40 +38,34 @@ def train(args: BCTrainConfig): args.group = args.task + "-cost-" + str(int(args.cost_limit)) if args.logdir is not None: args.logdir = os.path.join(args.logdir, args.group, args.name) - # logger = WandbLogger(cfg, args.project, args.group, args.name, args.logdir) + logger = WandbLogger(cfg, args.project, args.group, args.name, args.logdir) # # logger = TensorboardLogger(args.logdir, log_txt=True, name=args.name) - # logger.save_config(cfg, verbose=args.verbose) - logger = DummyLogger() + logger.save_config(cfg, verbose=args.verbose) # the cost scale is down in trainer rollout env = gym.make(args.task) data = env.get_dataset() env.set_target_cost(args.cost_limit) + cbins, rbins, max_npb, min_npb = None, None, None, None if args.density != 1.0: - density_cfg = DENSITY_CFG[args.task+"_density"+str(args.density)] + density_cfg = DENSITY_CFG[args.task + "_density" + str(args.density)] cbins = density_cfg["cbins"] rbins = density_cfg["rbins"] max_npb = density_cfg["max_npb"] min_npb = density_cfg["min_npb"] - data = env.pre_process_data(data, args.outliers_percent, args.noise_scale, - args.inpaint_ranges, args.epsilon, args.density, - cbins=cbins, rbins=rbins, max_npb=max_npb, min_npb=min_npb) - - # function w.r.t episode cost - frontier_fn = {} - frontier_fn["OfflineAntCircle-v0"] = lambda x: 600 + 4 * x - frontier_fn["OfflineAntRun-v0"] = lambda x: 600 + 10 / 3 * x - frontier_fn["OfflineCarCircle-v0"] = lambda x: 450 + 5 / 3 * x - frontier_fn["OfflineCarRun-v0"] = lambda x: 600 - frontier_fn["OfflineDroneRun-v0"] = lambda x: 325 + 125 / 70 * x - frontier_fn["OfflineDroneCircle-v0"] = lambda x: 600 + 4 * x - frontier_range = 50 - - process_bc_dataset(data, args.cost_limit, args.gamma, args.bc_mode, - # frontier_fn[args.task], - None, - frontier_range) + data = env.pre_process_data(data, + args.outliers_percent, + args.noise_scale, + args.inpaint_ranges, + args.epsilon, + args.density, + cbins=cbins, + rbins=rbins, + max_npb=max_npb, + min_npb=min_npb) + + process_bc_dataset(data, args.cost_limit, args.gamma, args.bc_mode) # model & optimizer & scheduler setup state_dim = env.observation_space.shape[0] diff --git a/examples/train/train_bcql.py b/examples/train/train_bcql.py index 286ee41..12fecfc 100644 --- a/examples/train/train_bcql.py +++ b/examples/train/train_bcql.py @@ -1,24 +1,24 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple -from dataclasses 
import asdict, dataclass import os import uuid +from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import gymnasium as gym # noqa import bullet_safety_gym # noqa import dsrl +import gymnasium as gym # noqa import numpy as np import pyrallis import torch -from torch.utils.data import DataLoader -from tqdm.auto import trange # noqa -from dsrl.infos import DEFAULT_MAX_EPISODE_STEPS, DENSITY_CFG +from dsrl.infos import DENSITY_CFG from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa from fsrl.utils import WandbLogger +from torch.utils.data import DataLoader +from tqdm.auto import trange # noqa -from osrl.common import TransitionDataset +from examples.configs.bcql_configs import BCQL_DEFAULT_CONFIG, BCQLTrainConfig from osrl.algorithms import BCQL, BCQLTrainer -from fsrl.utils.exp_util import auto_name, seed_all -from examples.configs.bcql_configs import BCQLTrainConfig, BCQL_DEFAULT_CONFIG +from osrl.common import TransitionDataset +from osrl.common.exp_util import auto_name, seed_all @pyrallis.wrap() @@ -28,7 +28,6 @@ def train(args: BCQLTrainConfig): torch.set_num_threads(args.threads) # setup logger - args.episode_len = DEFAULT_MAX_EPISODE_STEPS[args.task.split("-")[0][len("Offline"):][:-len("Gymnasium")]] cfg = asdict(args) default_cfg = asdict(BCQL_DEFAULT_CONFIG[args.task]()) if args.name is None: @@ -47,18 +46,24 @@ def train(args: BCQLTrainConfig): # pre-process offline dataset data = env.get_dataset() env.set_target_cost(args.cost_limit) - # data = env.pre_process_data(data, args.outliers_percent, args.noise_scale, - # args.inpaint_ranges, args.epsilon) + cbins, rbins, max_npb, min_npb = None, None, None, None if args.density != 1.0: - density_cfg = DENSITY_CFG[args.task+"_density"+str(args.density)] + density_cfg = DENSITY_CFG[args.task + "_density" + str(args.density)] cbins = density_cfg["cbins"] rbins = density_cfg["rbins"] max_npb = density_cfg["max_npb"] min_npb = density_cfg["min_npb"] - data = env.pre_process_data(data, args.outliers_percent, args.noise_scale, - args.inpaint_ranges, args.epsilon, args.density, - cbins=cbins, rbins=rbins, max_npb=max_npb, min_npb=min_npb) + data = env.pre_process_data(data, + args.outliers_percent, + args.noise_scale, + args.inpaint_ranges, + args.epsilon, + args.density, + cbins=cbins, + rbins=rbins, + max_npb=max_npb, + min_npb=min_npb) # wrapper env = wrap_env( diff --git a/examples/train/train_bearl.py b/examples/train/train_bearl.py index 97dd4f0..8970b08 100644 --- a/examples/train/train_bearl.py +++ b/examples/train/train_bearl.py @@ -1,24 +1,24 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple -from dataclasses import asdict, dataclass import os import uuid +from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import gymnasium as gym # noqa import bullet_safety_gym # noqa import dsrl +import gymnasium as gym # noqa import numpy as np import pyrallis import torch -from torch.utils.data import DataLoader -from tqdm.auto import trange # noqa -from dsrl.infos import DEFAULT_MAX_EPISODE_STEPS, DENSITY_CFG +from dsrl.infos import DENSITY_CFG from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa from fsrl.utils import WandbLogger +from torch.utils.data import DataLoader +from tqdm.auto import trange # noqa -from osrl.common import TransitionDataset +from examples.configs.bearl_configs import BEARL_DEFAULT_CONFIG, BEARLTrainConfig from osrl.algorithms import BEARL, BEARLTrainer -from 
fsrl.utils.exp_util import auto_name, seed_all -from examples.configs.bearl_configs import BEARLTrainConfig, BEARL_DEFAULT_CONFIG +from osrl.common import TransitionDataset +from osrl.common.exp_util import auto_name, seed_all @pyrallis.wrap() @@ -28,7 +28,6 @@ def train(args: BEARLTrainConfig): torch.set_num_threads(args.threads) # setup logger - args.episode_len = DEFAULT_MAX_EPISODE_STEPS[args.task.split("-")[0][len("Offline"):][:-len("Gymnasium")]] cfg = asdict(args) default_cfg = asdict(BEARL_DEFAULT_CONFIG[args.task]()) if args.name is None: @@ -47,18 +46,24 @@ def train(args: BEARLTrainConfig): # pre-process offline dataset data = env.get_dataset() env.set_target_cost(args.cost_limit) - # data = env.pre_process_data(data, args.outliers_percent, args.noise_scale, - # args.inpaint_ranges, args.epsilon) + cbins, rbins, max_npb, min_npb = None, None, None, None if args.density != 1.0: - density_cfg = DENSITY_CFG[args.task+"_density"+str(args.density)] + density_cfg = DENSITY_CFG[args.task + "_density" + str(args.density)] cbins = density_cfg["cbins"] rbins = density_cfg["rbins"] max_npb = density_cfg["max_npb"] min_npb = density_cfg["min_npb"] - data = env.pre_process_data(data, args.outliers_percent, args.noise_scale, - args.inpaint_ranges, args.epsilon, args.density, - cbins=cbins, rbins=rbins, max_npb=max_npb, min_npb=min_npb) + data = env.pre_process_data(data, + args.outliers_percent, + args.noise_scale, + args.inpaint_ranges, + args.epsilon, + args.density, + cbins=cbins, + rbins=rbins, + max_npb=max_npb, + min_npb=min_npb) # wrapper env = wrap_env( diff --git a/examples/train/train_cdt.py b/examples/train/train_cdt.py index 1a7a069..5b98cab 100644 --- a/examples/train/train_cdt.py +++ b/examples/train/train_cdt.py @@ -1,23 +1,24 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple -from dataclasses import asdict, dataclass import os import uuid +from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import gymnasium as gym # noqa import bullet_safety_gym # noqa import dsrl +import gymnasium as gym # noqa import numpy as np import pyrallis import torch -from torch.utils.data import DataLoader -from tqdm.auto import trange # noqa +from dsrl.infos import DENSITY_CFG from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa from fsrl.utils import WandbLogger +from torch.utils.data import DataLoader +from tqdm.auto import trange # noqa -from osrl.common import SequenceDataset +from examples.configs.cdt_configs import CDT_DEFAULT_CONFIG, CDTTrainConfig from osrl.algorithms import CDT, CDTTrainer -from fsrl.utils.exp_util import auto_name, seed_all -from examples.configs.cdt_configs import CDTTrainConfig, CDT_DEFAULT_CONFIG +from osrl.common import SequenceDataset +from osrl.common.exp_util import auto_name, seed_all @pyrallis.wrap() @@ -31,6 +32,8 @@ def train(args: CDTTrainConfig): default_cfg = asdict(CDT_DEFAULT_CONFIG[args.task]()) if args.name is None: args.name = auto_name(default_cfg, cfg, args.prefix, args.suffix) + if args.group is None: + args.group = args.task + "-cost-" + str(int(args.cost_limit)) if args.logdir is not None: args.logdir = os.path.join(args.logdir, args.group, args.name) logger = WandbLogger(cfg, args.project, args.group, args.name, args.logdir) @@ -43,8 +46,24 @@ def train(args: CDTTrainConfig): # pre-process offline dataset data = env.get_dataset() env.set_target_cost(args.cost_limit) - data = env.pre_process_data(data, args.outliers_percent, args.noise_scale, - 
args.inpaint_ranges, args.epsilon) + + cbins, rbins, max_npb, min_npb = None, None, None, None + if args.density != 1.0: + density_cfg = DENSITY_CFG[args.task + "_density" + str(args.density)] + cbins = density_cfg["cbins"] + rbins = density_cfg["rbins"] + max_npb = density_cfg["max_npb"] + min_npb = density_cfg["min_npb"] + data = env.pre_process_data(data, + args.outliers_percent, + args.noise_scale, + args.inpaint_ranges, + args.epsilon, + args.density, + cbins=cbins, + rbins=rbins, + max_npb=max_npb, + min_npb=min_npb) # wrapper env = wrap_env( diff --git a/examples/train/train_coptidice.py b/examples/train/train_coptidice.py index abdb7a2..ae51694 100644 --- a/examples/train/train_coptidice.py +++ b/examples/train/train_coptidice.py @@ -1,24 +1,25 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple -from dataclasses import asdict, dataclass import os import uuid +from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import gymnasium as gym # noqa import bullet_safety_gym # noqa import dsrl +import gymnasium as gym # noqa import numpy as np import pyrallis import torch -from torch.utils.data import DataLoader -from tqdm.auto import trange # noqa -from dsrl.infos import DEFAULT_MAX_EPISODE_STEPS, DENSITY_CFG +from dsrl.infos import DENSITY_CFG from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa from fsrl.utils import WandbLogger +from torch.utils.data import DataLoader +from tqdm.auto import trange # noqa -from osrl.common import TransitionDataset +from examples.configs.coptidice_configs import (COptiDICE_DEFAULT_CONFIG, + COptiDICETrainConfig) from osrl.algorithms import COptiDICE, COptiDICETrainer -from fsrl.utils.exp_util import auto_name, seed_all -from examples.configs.coptidice_configs import COptiDICETrainConfig, COptiDICE_DEFAULT_CONFIG +from osrl.common import TransitionDataset +from osrl.common.exp_util import auto_name, seed_all @pyrallis.wrap() @@ -28,7 +29,6 @@ def train(args: COptiDICETrainConfig): torch.set_num_threads(args.threads) # setup logger - args.episode_len = DEFAULT_MAX_EPISODE_STEPS[args.task.split("-")[0][len("Offline"):][:-len("Gymnasium")]] cfg = asdict(args) default_cfg = asdict(COptiDICE_DEFAULT_CONFIG[args.task]()) if args.name is None: @@ -47,18 +47,24 @@ def train(args: COptiDICETrainConfig): # pre-process offline dataset data = env.get_dataset() env.set_target_cost(args.cost_limit) - # data = env.pre_process_data(data, args.outliers_percent, args.noise_scale, - # args.inpaint_ranges, args.epsilon) + cbins, rbins, max_npb, min_npb = None, None, None, None if args.density != 1.0: - density_cfg = DENSITY_CFG[args.task+"_density"+str(args.density)] + density_cfg = DENSITY_CFG[args.task + "_density" + str(args.density)] cbins = density_cfg["cbins"] rbins = density_cfg["rbins"] max_npb = density_cfg["max_npb"] min_npb = density_cfg["min_npb"] - data = env.pre_process_data(data, args.outliers_percent, args.noise_scale, - args.inpaint_ranges, args.epsilon, args.density, - cbins=cbins, rbins=rbins, max_npb=max_npb, min_npb=min_npb) + data = env.pre_process_data(data, + args.outliers_percent, + args.noise_scale, + args.inpaint_ranges, + args.epsilon, + args.density, + cbins=cbins, + rbins=rbins, + max_npb=max_npb, + min_npb=min_npb) # wrapper env = wrap_env( diff --git a/examples/train/train_cpq.py b/examples/train/train_cpq.py index 6df2c9d..97862b9 100644 --- a/examples/train/train_cpq.py +++ b/examples/train/train_cpq.py @@ -1,24 +1,24 @@ -from typing import Any, DefaultDict, 
Dict, List, Optional, Tuple -from dataclasses import asdict, dataclass import os import uuid +from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import gymnasium as gym # noqa import bullet_safety_gym # noqa import dsrl +import gymnasium as gym # noqa import numpy as np import pyrallis import torch -from torch.utils.data import DataLoader -from tqdm.auto import trange # noqa -from dsrl.infos import DEFAULT_MAX_EPISODE_STEPS, DENSITY_CFG +from dsrl.infos import DENSITY_CFG from dsrl.offline_env import OfflineEnvWrapper, wrap_env # noqa from fsrl.utils import WandbLogger +from torch.utils.data import DataLoader +from tqdm.auto import trange # noqa -from osrl.common import TransitionDataset +from examples.configs.cpq_configs import CPQ_DEFAULT_CONFIG, CPQTrainConfig from osrl.algorithms import CPQ, CPQTrainer -from fsrl.utils.exp_util import auto_name, seed_all -from examples.configs.cpq_configs import CPQTrainConfig, CPQ_DEFAULT_CONFIG +from osrl.common import TransitionDataset +from osrl.common.exp_util import auto_name, seed_all @pyrallis.wrap() @@ -28,7 +28,6 @@ def train(args: CPQTrainConfig): torch.set_num_threads(args.threads) # setup logger - args.episode_len = DEFAULT_MAX_EPISODE_STEPS[args.task.split("-")[0][len("Offline"):][:-len("Gymnasium")]] cfg = asdict(args) default_cfg = asdict(CPQ_DEFAULT_CONFIG[args.task]()) if args.name is None: @@ -47,18 +46,24 @@ def train(args: CPQTrainConfig): # pre-process offline dataset data = env.get_dataset() env.set_target_cost(args.cost_limit) - # data = env.pre_process_data(data, args.outliers_percent, args.noise_scale, - # args.inpaint_ranges, args.epsilon) + cbins, rbins, max_npb, min_npb = None, None, None, None if args.density != 1.0: - density_cfg = DENSITY_CFG[args.task+"_density"+str(args.density)] + density_cfg = DENSITY_CFG[args.task + "_density" + str(args.density)] cbins = density_cfg["cbins"] rbins = density_cfg["rbins"] max_npb = density_cfg["max_npb"] min_npb = density_cfg["min_npb"] - data = env.pre_process_data(data, args.outliers_percent, args.noise_scale, - args.inpaint_ranges, args.epsilon, args.density, - cbins=cbins, rbins=rbins, max_npb=max_npb, min_npb=min_npb) + data = env.pre_process_data(data, + args.outliers_percent, + args.noise_scale, + args.inpaint_ranges, + args.epsilon, + args.density, + cbins=cbins, + rbins=rbins, + max_npb=max_npb, + min_npb=min_npb) # wrapper env = wrap_env( diff --git a/examples/train_all_tasks.py b/examples/train_all_tasks.py index fbe7ee8..499e7f2 100644 --- a/examples/train_all_tasks.py +++ b/examples/train_all_tasks.py @@ -1,21 +1,60 @@ -from fsrl.utils.exp_util import ExperimentGrid +from easy_runner import EasyRunner if __name__ == "__main__": exp_name = "benchmark" - runner = ExperimentGrid(log_name=exp_name) + runner = EasyRunner(log_name=exp_name) task = [ - "offline-AntCircle-v0", "offline-AntRun-v0", "offline-CarCircle-v0", - "offline-DroneCircle-v0", "offline-DroneRun-v0" + # bullet safety gym envs + "OfflineAntCircle-v0", + "OfflineAntRun-v0", + "OfflineCarCircle-v0", + "OfflineDroneCircle-v0", + "OfflineDroneRun-v0", + "OfflineBallCircle-v0", + "OfflineBallRun-v0", + "OfflineCarRun-v0", + # safety gymnasium: car + "OfflineCarButton1Gymnasium-v0", + "OfflineCarButton2Gymnasium-v0", + "OfflineCarCircle1Gymnasium-v0", + "OfflineCarCircle2Gymnasium-v0", + "OfflineCarGoal1Gymnasium-v0", + "OfflineCarGoal2Gymnasium-v0", + "OfflineCarPush1Gymnasium-v0", + "OfflineCarPush2Gymnasium-v0", + # safety gymnasium: point + 
"OfflinePointButton1Gymnasium-v0", + "OfflinePointButton2Gymnasium-v0", + "OfflinePointCircle1Gymnasium-v0", + "OfflinePointCircle2Gymnasium-v0", + "OfflinePointGoal1Gymnasium-v0", + "OfflinePointGoal2Gymnasium-v0", + "OfflinePointPush1Gymnasium-v0", + "OfflinePointPush2Gymnasium-v0", + # safety gymnasium: velocity + "OfflineAntVelocityGymnasium-v1", + "OfflineHalfCheetahVelocityGymnasium-v1", + "OfflineHopperVelocityGymnasium-v1", + "OfflineSwimmerVelocityGymnasium-v1", + "OfflineWalker2dVelocityGymnasium-v1", + # metadrive envs + "OfflineMetadrive-easysparse-v0", + "OfflineMetadrive-easymean-v0", + "OfflineMetadrive-easydense-v0", + "OfflineMetadrive-mediumsparse-v0", + "OfflineMetadrive-mediummean-v0", + "OfflineMetadrive-mediumdense-v0", + "OfflineMetadrive-hardsparse-v0", + "OfflineMetadrive-hardmean-v0", + "OfflineMetadrive-harddense-v0", ] - # outliers_percent = [0.05, 0.1, 0.15] - # noise_scale = [0.05, 0.1, 0.15] + policy = ["train_bc", "train_bcql", "train_bearl", "train_coptidice", "train_cpq"] - # seed = [0, 10, 20] # Do not write & to the end of the command, it will be added automatically. template = "nohup python examples/train/{}.py --task {} --device cpu" train_instructions = runner.compose(template, [policy, task]) - runner.run(train_instructions, max_parallel=15) + runner.start(train_instructions, max_parallel=15) diff --git a/osrl/__init__.py b/osrl/__init__.py index 6876063..c0802f3 100644 --- a/osrl/__init__.py +++ b/osrl/__init__.py @@ -1,7 +1,6 @@ __version__ = "0.1.0" - __all__ = [ "algorithms", "common", -] \ No newline at end of file +] diff --git a/osrl/algorithms/bc.py b/osrl/algorithms/bc.py index 370c6e6..aa903af 100644 --- a/osrl/algorithms/bc.py +++ b/osrl/algorithms/bc.py @@ -1,16 +1,11 @@ -from dataclasses import asdict, dataclass -from copy import deepcopy -from typing import Any, DefaultDict, Dict, List, Optional, Tuple - import gymnasium as gym -import dsrl import numpy as np -from tqdm.auto import tqdm, trange # noqa - import torch import torch.nn as nn import torch.nn.functional as F -from fsrl.utils import WandbLogger, DummyLogger +from fsrl.utils import DummyLogger, WandbLogger +from tqdm.auto import trange # noqa + from osrl.common.net import MLPActor diff --git a/osrl/algorithms/bcql.py b/osrl/algorithms/bcql.py index fa55674..4a256df 100644 --- a/osrl/algorithms/bcql.py +++ b/osrl/algorithms/bcql.py @@ -1,20 +1,14 @@ -from dataclasses import asdict, dataclass from copy import deepcopy -from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import os -import uuid import gymnasium as gym -import dsrl -import pyrallis import numpy as np -from tqdm.auto import tqdm, trange # noqa - import torch import torch.nn as nn -from fsrl.utils import WandbLogger, DummyLogger -from osrl.common.net import MLPGaussianPerturbationActor, \ - EnsembleDoubleQCritic, VAE, LagrangianPIDController +from fsrl.utils import DummyLogger, WandbLogger +from tqdm.auto import trange # noqa + +from osrl.common.net import (VAE, EnsembleDoubleQCritic, LagrangianPIDController, + MLPGaussianPerturbationActor) class BCQL(nn.Module): diff --git a/osrl/algorithms/bearl.py b/osrl/algorithms/bearl.py index df87d6c..52b60f2 100644 --- a/osrl/algorithms/bearl.py +++ b/osrl/algorithms/bearl.py @@ -1,18 +1,14 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union -from collections import defaultdict from copy import deepcopy -from dataclasses import asdict, dataclass import gymnasium as gym import numpy as np -from tqdm.auto import tqdm, trange # noqa - import 
torch import torch.nn as nn +from fsrl.utils import DummyLogger, WandbLogger +from tqdm.auto import trange # noqa -from fsrl.utils import WandbLogger, DummyLogger -from osrl.common.net import SquashedGaussianMLPActor, EnsembleDoubleQCritic, \ - VAE, LagrangianPIDController +from osrl.common.net import (VAE, EnsembleDoubleQCritic, LagrangianPIDController, + SquashedGaussianMLPActor) class BEARL(nn.Module): diff --git a/osrl/algorithms/cdt.py b/osrl/algorithms/cdt.py index 26db6a8..f384be1 100644 --- a/osrl/algorithms/cdt.py +++ b/osrl/algorithms/cdt.py @@ -1,19 +1,15 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union -from collections import defaultdict -from dataclasses import asdict, dataclass +from typing import Optional, Tuple import gymnasium as gym import numpy as np -from tqdm.auto import tqdm, trange # noqa - import torch import torch.nn as nn -from torch.nn import functional as F # noqa -from torch import distributions as pyd +from fsrl.utils import DummyLogger, WandbLogger from torch.distributions.beta import Beta +from torch.nn import functional as F # noqa +from tqdm.auto import trange # noqa -from osrl.common.net import TransformerBlock, mlp, DiagGaussianActor -from fsrl.utils import WandbLogger, DummyLogger +from osrl.common.net import DiagGaussianActor, TransformerBlock, mlp class CDT(nn.Module): diff --git a/osrl/algorithms/coptidice.py b/osrl/algorithms/coptidice.py index 2877f60..ebc6442 100644 --- a/osrl/algorithms/coptidice.py +++ b/osrl/algorithms/coptidice.py @@ -1,19 +1,14 @@ -from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union -from collections import defaultdict -from dataclasses import asdict, dataclass - import gymnasium as gym import numpy as np -from tqdm.auto import tqdm, trange # noqa - import torch import torch.nn as nn -from torch.nn import functional as F # noqa +from fsrl.utils import DummyLogger, WandbLogger from torch import distributions as pyd from torch.distributions.beta import Beta +from torch.nn import functional as F # noqa +from tqdm.auto import trange # noqa -from fsrl.utils import WandbLogger, DummyLogger -from osrl.common.net import SquashedGaussianMLPActor, EnsembleQCritic +from osrl.common.net import EnsembleQCritic, SquashedGaussianMLPActor def get_f_div_fn(f_type: str): diff --git a/osrl/algorithms/cpq.py b/osrl/algorithms/cpq.py index edca45e..f1d42ca 100644 --- a/osrl/algorithms/cpq.py +++ b/osrl/algorithms/cpq.py @@ -1,19 +1,13 @@ -from dataclasses import asdict, dataclass from copy import deepcopy -from typing import Any, DefaultDict, Dict, List, Optional, Tuple -import os -import uuid import gymnasium as gym -import dsrl -import pyrallis import numpy as np -from tqdm.auto import tqdm, trange # noqa - import torch import torch.nn as nn -from fsrl.utils import WandbLogger, DummyLogger -from osrl.common.net import SquashedGaussianMLPActor, EnsembleQCritic, VAE +from fsrl.utils import DummyLogger, WandbLogger +from tqdm.auto import trange # noqa + +from osrl.common.net import VAE, EnsembleQCritic, SquashedGaussianMLPActor class CPQ(nn.Module): diff --git a/osrl/common/__init__.py b/osrl/common/__init__.py index 0acb487..182fee6 100644 --- a/osrl/common/__init__.py +++ b/osrl/common/__init__.py @@ -1,2 +1,3 @@ -from .dataset import SequenceDataset, TransitionDataset -from .net import * \ No newline at end of file +from osrl.common.dataset import SequenceDataset, TransitionDataset +from osrl.common.exp_util import * +from osrl.common.net import * diff --git a/osrl/common/dataset.py 
b/osrl/common/dataset.py
index 1d2c470..a0e379b 100644
--- a/osrl/common/dataset.py
+++ b/osrl/common/dataset.py
@@ -1,15 +1,16 @@
-from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union
-from collections import defaultdict
-from dataclasses import asdict, dataclass
-import random
 import copy
 import heapq
+import random
+from collections import Counter, defaultdict
+from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union
 
 import numpy as np
-import oapackage
-from collections import Counter
+
+try:
+    import oapackage
+except ImportError:
+    print("OApackage is not installed, cannot use CDT.")
 from scipy.optimize import minimize
-from random import sample
 from torch.nn import functional as F  # noqa
 from torch.utils.data import IterableDataset
 from tqdm.auto import trange  # noqa
@@ -26,8 +27,7 @@ def discounted_cumsum(x: np.ndarray, gamma: float) -> np.ndarray:
     return cumsum
 
 
-def process_bc_dataset(dataset: dict, cost_limit: float, gamma: float, bc_mode: str,
-                       frontier_fn=None, frontier_range=None):
+def process_bc_dataset(dataset: dict, cost_limit: float, gamma: float, bc_mode: str):
     """
     Processes a given dataset for behavior cloning and its variants.
 
@@ -51,51 +51,87 @@ def process_bc_dataset(dataset: dict, cost_limit: float, gamma: float, bc_mode:
         dict: A dictionary containing the processed dataset.
     """
-
+    # get the indices of the transitions after terminal states or timeouts
     done_idx = np.where((dataset["terminals"] == 1) | (dataset["timeouts"] == 1))[0]
-
+
     n_transitions = dataset["observations"].shape[0]
-    selected_transition = np.zeros((n_transitions,), dtype=int)
     dataset["cost_returns"] = np.zeros_like(dataset["costs"])
-
+    dataset["rew_returns"] = np.zeros_like(dataset["rewards"])
+    cost_ret, rew_ret = [], []
+    pareto_frontier, pf_mask = None, None
+
+    # compute episode returns
     for i in range(done_idx.shape[0]):
-
-        start = 0 if i == 0 else done_idx[i-1] + 1
+        start = 0 if i == 0 else done_idx[i - 1] + 1
         end = done_idx[i] + 1
-        # compute the cost and reward returns for the segment
         cost_returns = discounted_cumsum(dataset["costs"][start:end], gamma=gamma)
         reward_returns = discounted_cumsum(dataset["rewards"][start:end], gamma=gamma)
         dataset["cost_returns"][start:end] = cost_returns[0]
-
-        # select the transitions for behavior cloning based on the mode
-        if bc_mode == "all" or bc_mode == "multi-task":
-            selected_transition[start:end] = 1
-        elif bc_mode == "safe":
-            # safe trajectories
-            if cost_returns[0] <= cost_limit:
-                selected_transition[start:end] = 1
-        elif bc_mode == "risky":
-            # high cost trajectories
-            if cost_returns[0] >= 2 * cost_limit:
-                selected_transition[start:end] = 1
-        elif bc_mode == "frontier":
-            # trajectories that are near the Pareto frontier
-            if frontier_fn(cost_returns[0]) - frontier_range <= reward_returns[0] and \
-                reward_returns[0] <= frontier_fn(cost_returns[0]) + frontier_range:
-                selected_transition[start:end] = 1
-        elif bc_mode == "boundary":
-            # trajectories that are near the cost limit
-            if 0.5 * cost_limit < cost_returns[0] and cost_returns[0] <= 1.5 * cost_limit:
-                selected_transition[start:end] = 1
+        dataset["rew_returns"][start:end] = reward_returns[0]
+        cost_ret.append(cost_returns[0])
+        rew_ret.append(reward_returns[0])
+
+    # compute the Pareto frontier
+    if bc_mode == "frontier":
+        cost_ret = np.array(cost_ret, dtype=np.float64)
+        rew_ret = np.array(rew_ret, dtype=np.float64)
+        rmax, rmin = np.max(rew_ret), np.min(rew_ret)
+
+        pareto = oapackage.ParetoDoubleLong()
+        for i in range(rew_ret.shape[0]):
+            w = oapackage.doubleVector((-cost_ret[i], rew_ret[i]))
+            pareto.addvalue(w, i)
+        pareto.show(verbose=1)
+        pareto_idx = list(pareto.allindices())
+        cost_ret_pareto = cost_ret[pareto_idx]
+        rew_ret_pareto = rew_ret[pareto_idx]
+
+        for deg in [0, 1, 2]:
+            pareto_frontier = np.poly1d(
+                np.polyfit(cost_ret_pareto, rew_ret_pareto, deg=deg))
+            pf_rew_ret = pareto_frontier(cost_ret_pareto)
+            ss_total = np.sum((rew_ret_pareto - np.mean(rew_ret_pareto))**2)
+            ss_residual = np.sum((rew_ret_pareto - pf_rew_ret)**2)
+            r_squared = 1 - (ss_residual / ss_total)
+            if r_squared >= 0.9:
+                break
+
+        pf_rew_ret = pareto_frontier(dataset["cost_returns"])
+        pf_mask = np.logical_and(
+            pf_rew_ret - (rmax - rmin) / 5 <= dataset["rew_returns"],
+            dataset["rew_returns"] <= pf_rew_ret + (rmax - rmin) / 5)
+
+    # select the transitions for behavior cloning based on the mode
+    selected_transition = np.zeros((n_transitions, ), dtype=int)
+    if bc_mode == "all" or bc_mode == "multi-task":
+        selected_transition = np.ones((n_transitions, ), dtype=int)
+    elif bc_mode == "safe":
+        # safe trajectories
+        selected_transition[dataset["cost_returns"] <= cost_limit] = 1
+    elif bc_mode == "risky":
+        # high cost trajectories
+        selected_transition[dataset["cost_returns"] >= 2 * cost_limit] = 1
+    elif bc_mode == "boundary":
+        # trajectories that are near the cost limit
+        mask = np.logical_and(0.5 * cost_limit < dataset["cost_returns"],
+                              dataset["cost_returns"] <= 1.5 * cost_limit)
+        selected_transition[mask] = 1
+    elif bc_mode == "frontier":
+        selected_transition[pf_mask] = 1
+    else:
+        raise NotImplementedError
 
     for k, v in dataset.items():
         dataset[k] = v[selected_transition == 1]
 
     if bc_mode == "multi-task":
-        dataset["observations"] = np.hstack((dataset["observations"], dataset["cost_returns"].reshape(-1, 1)))
+        dataset["observations"] = np.hstack(
+            (dataset["observations"], dataset["cost_returns"].reshape(-1, 1)))
 
-    print(f"original size = {n_transitions}, cost limit = {cost_limit}, filtered size = {np.sum(selected_transition == 1)}")
+    print(
+        f"original size = {n_transitions}, cost limit = {cost_limit}, filtered size = {np.sum(selected_transition == 1)}"
+    )
 
 
 def process_sequence_dataset(dataset: dict, cost_reverse: bool = False):
@@ -130,7 +166,8 @@ def process_sequence_dataset(dataset: dict, cost_reverse: bool = False):
         episode_data = {k: np.array(v, dtype=np.float32) for k, v in data_.items()}
         # return-to-go if gamma=1.0, else just discounted returns
         episode_data["returns"] = discounted_cumsum(episode_data["rewards"], gamma=1)
-        episode_data["cost_returns"] = discounted_cumsum(episode_data["costs"], gamma=1)
+        episode_data["cost_returns"] = discounted_cumsum(episode_data["costs"],
+                                                         gamma=1)
         traj.append(episode_data)
         traj_len.append(episode_step)
         # reset trajectory buffer
@@ -150,7 +187,6 @@ def get_nearest_point(original_data: np.ndarray,
                       sampled_data: np.ndarray,
                       max_rew_decrease: float = 1,
                       beta: float = 1):
-
     """
    Given two arrays of data, finds the indices of the original data that are closest
    to each sample in the sampled data, and returns a list of those indices.
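For readers without OApackage, the frontier step of `process_bc_dataset`'s new `frontier` mode above is easy to approximate with plain numpy. The sketch below is illustrative and not part of the patch: `pareto_front_indices` and `fit_frontier` are hypothetical names, and the sort-and-scan stands in for `oapackage.ParetoDoubleLong`, while the polynomial selection mirrors the R^2 >= 0.9 rule in the hunk above.

```python
import numpy as np


def pareto_front_indices(cost_ret: np.ndarray, rew_ret: np.ndarray) -> np.ndarray:
    """Indices of episodes on the (low-cost, high-reward) Pareto front."""
    # sort by cost ascending, breaking cost ties by higher reward first
    order = np.lexsort((-rew_ret, cost_ret))
    keep, best_rew = [], -np.inf
    for i in order:
        if rew_ret[i] > best_rew:  # no cheaper episode achieves this reward
            keep.append(i)
            best_rew = rew_ret[i]
    return np.asarray(keep)


def fit_frontier(cost_ret: np.ndarray, rew_ret: np.ndarray, r2_min: float = 0.9):
    """Fit the lowest-degree polynomial reaching R^2 >= r2_min on the front."""
    idx = pareto_front_indices(cost_ret, rew_ret)
    x, y = cost_ret[idx], rew_ret[idx]
    for deg in (0, 1, 2):
        poly = np.poly1d(np.polyfit(x, y, deg=deg))
        ss_res = np.sum((y - poly(x))**2)
        ss_tot = np.sum((y - np.mean(y))**2)
        if ss_tot == 0 or 1 - ss_res / ss_tot >= r2_min:
            break
    return poly
```

As in the hunk above, the fitted polynomial is then evaluated at every transition's cost return, and only transitions whose reward return lies within (rmax - rmin) / 5 of the curve are kept.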
@@ -188,17 +224,28 @@ def get_nearest_point(original_data: np.ndarray,
         mask = original_data[:, 0] <= p[0]
         # the associated data should be: 1) smaller than the current cost 2) greater than a certain reward
-        mask = np.logical_and(original_data[:, 0] <= p[0], original_data[:, 1] >= p[1] - max_rew_decrease)
+        mask = np.logical_and(original_data[:, 0] <= p[0],
+                              original_data[:, 1] >= p[1] - max_rew_decrease)
         delta = original_data[mask, :] - p
         dist = np.hypot(delta[:, 0], delta[:, 1])
         dist = dist_fun(dist)
-        sample_idx = np.random.choice(dist.shape[0], size=num - 1, p=dist / np.sum(dist))
+        sample_idx = np.random.choice(dist.shape[0],
+                                      size=num - 1,
+                                      p=dist / np.sum(dist))
         new_idxes.extend(original_idx[mask][sample_idx.tolist()])
 
     return new_idxes
 
 
-def grid_filter(x, y, xmin=-np.inf, xmax=np.inf, ymin=-np.inf, ymax=np.inf,
-                xbins=10, ybins=10, max_num_per_bin=10, min_num_per_bin=1):
+def grid_filter(x,
+                y,
+                xmin=-np.inf,
+                xmax=np.inf,
+                ymin=-np.inf,
+                ymax=np.inf,
+                xbins=10,
+                ybins=10,
+                max_num_per_bin=10,
+                min_num_per_bin=1):
     xmin, xmax = max(min(x), xmin), min(max(x), xmax)
     ymin, ymax = max(min(y), ymin), min(max(y), ymax)
     xbin_step = (xmax - xmin) / xbins
@@ -216,7 +263,7 @@ def grid_filter(x, y, xmin=-np.inf, xmax=np.inf, ymin=-np.inf, ymax=np.inf,
     for v in bin_hashmap.values():
         if len(v) > max_num_per_bin:
             # random sample max_num_per_bin indices
-            indices += sample(v, max_num_per_bin)
+            indices += random.sample(v, max_num_per_bin)
         elif len(v) <= min_num_per_bin:
             continue
         else:
@@ -287,19 +334,23 @@ def augmentation(trajs: list,
         cost_ret.append(c)
     rew_ret = np.array(rew_ret, dtype=np.float64)
     cost_ret = np.array(cost_ret, dtype=np.float64)
-    
+
     # grid filter to filter outliers
-    cmin, cmax = 0, 70
-    rmin, rmax = 100, 1000
+    cmin, cmax = np.min(cost_ret), np.max(cost_ret)
+    rmin, rmax = np.min(rew_ret), np.max(rew_ret)
     cbins, rbins = 10, 50
     max_npb, min_npb = 10, 2
-    cost_ret, rew_ret, trajs, indices = filter_trajectory(
-        cost_ret, rew_ret, trajs,
-        cost_min=cmin, cost_max=cmax,
-        rew_min=rmin, rew_max=rmax,
-        cost_bins=cbins, rew_bins=rbins,
-        max_num_per_bin=max_npb,
-        min_num_per_bin=min_npb)
+    cost_ret, rew_ret, trajs, indices = filter_trajectory(cost_ret,
+                                                          rew_ret,
+                                                          trajs,
+                                                          cost_min=cmin,
+                                                          cost_max=cmax,
+                                                          rew_min=rmin,
+                                                          rew_max=rmax,
+                                                          cost_bins=cbins,
+                                                          rew_bins=rbins,
+                                                          max_num_per_bin=max_npb,
+                                                          min_num_per_bin=min_npb)
     print(f"after filter {len(trajs)}")
     rew_ret = np.array(rew_ret, dtype=np.float64)
     cost_ret = np.array(cost_ret, dtype=np.float64)
@@ -324,7 +375,9 @@ def augmentation(trajs: list,
     max_reward = max_reward * np.ones(pf_rew_ret.shape)
     min_reward = min_reward * np.ones(pf_rew_ret.shape)
     # sample the rewards that are above the pf curve and within the max_reward
-    sampled_rew_ret = np.random.uniform(low=pf_rew_ret + min_reward, high=max_reward, size=sample_num)
+    sampled_rew_ret = np.random.uniform(low=pf_rew_ret + min_reward,
+                                        high=max_reward,
+                                        size=sample_num)
 
     # associate each sampled (cost, reward) pair with a trajectory index
     original_data = np.hstack([cost_ret[:, None], rew_ret[:, None]])
@@ -442,7 +495,10 @@ def compute_start_index_sample_prob(dataset, prob=0.4):
 
 
 # some utility functions specific to the Decision Transformer
-def pad_along_axis(arr: np.ndarray, pad_to: int, axis: int = 0, fill_value: float = 0.0) -> np.ndarray:
+def pad_along_axis(arr: np.ndarray,
+                   pad_to: int,
+                   axis: int = 0,
+                   fill_value: float = 0.0) -> np.ndarray:
     pad_size = pad_to - arr.shape[axis]
     if pad_size <= 0:
         return arr
@@ -542,7 +598,9 @@ def random_augmentation(trajs: list,
     cmin = np.min(cost_ret)
     num = int(augment_percent * cost_ret.shape[0])
-    sampled_cr = np.random.uniform(low=(aug_cmin, aug_rmin), high=(aug_cmax, aug_rmax), size=(num, 2))
+    sampled_cr = np.random.uniform(low=(aug_cmin, aug_rmin),
+                                   high=(aug_cmax, aug_rmax),
+                                   size=(num, 2))
 
     idxes = []
     original_data = np.hstack([cost_ret[:, None], rew_ret[:, None]])
@@ -564,8 +622,10 @@ def random_augmentation(trajs: list,
         target_cost_ret, target_rew_ret = target[0], target[1]
         associated_traj = copy.deepcopy(trajs[i])
         cost_ret, rew_ret = associated_traj["cost_returns"], associated_traj["returns"]
-        cost_ret += target_cost_ret - cost_ret[0] + np.random.normal(loc=0, scale=cstd, size=cost_ret.shape)
-        rew_ret += target_rew_ret - rew_ret[0] + np.random.normal(loc=0, scale=rstd, size=rew_ret.shape)
+        cost_ret += target_cost_ret - cost_ret[0] + np.random.normal(
+            loc=0, scale=cstd, size=cost_ret.shape)
+        rew_ret += target_rew_ret - rew_ret[0] + np.random.normal(
+            loc=0, scale=rstd, size=rew_ret.shape)
         aug_trajs.append(associated_traj)
     return idxes, aug_trajs
@@ -647,7 +707,8 @@ def __init__(
             print("*" * 100)
             print("Using pareto frontier data points only!!!!!")
             print("*" * 100)
-            self.dataset = select_optimal_trajectory(self.original_data, rmin, cost_bins, npb)
+            self.dataset = select_optimal_trajectory(self.original_data, rmin, cost_bins,
+                                                     npb)
         elif random_aug > 0:
             self.idx, self.aug_data = random_augmentation(
                 self.original_data,
@@ -662,8 +723,9 @@ def __init__(
             )
         elif augment_percent > 0:
             # sampled data and the index of its "nearest" point in the dataset
-            self.idx, self.aug_data, self.pareto_frontier, self.indices = augmentation(self.original_data, deg, max_rew_decrease,
-                                                                                       beta, augment_percent, max_reward, min_reward)
+            self.idx, self.aug_data, self.pareto_frontier, self.indices = augmentation(
+                self.original_data, deg, max_rew_decrease, beta, augment_percent,
+                max_reward, min_reward)
             self.dataset = self.original_data + self.aug_data
             print(
                 f"original data: {len(self.original_data)}, augment data: {len(self.aug_data)}, total: {len(self.dataset)}"
@@ -678,7 +740,8 @@ def __init__(
         # compute every trajectory's start-index sampling prob:
         if start_sampling:
-            self.start_idx_sample_prob = compute_start_index_sample_prob(dataset=self.dataset, prob=prob)
+            self.start_idx_sample_prob = compute_start_index_sample_prob(
+                dataset=self.dataset, prob=prob)
 
     def compute_pareto_return(self, cost):
         return self.pareto_frontier(cost)
@@ -699,7 +762,9 @@ def __prepare_sample(self, traj_idx, start_idx):
         returns = returns * self.reward_scale
         cost_returns = cost_returns * self.cost_scale
         # pad up to seq_len if needed
-        mask = np.hstack([np.ones(states.shape[0]), np.zeros(self.seq_len - states.shape[0])])
+        mask = np.hstack(
+            [np.ones(states.shape[0]),
+             np.zeros(self.seq_len - states.shape[0])])
         if states.shape[0] < self.seq_len:
             states = pad_along_axis(states, pad_to=self.seq_len)
             actions = pad_along_axis(actions, pad_to=self.seq_len)
@@ -717,10 +782,11 @@ def __iter__(self):
                 start_idx = np.random.choice(self.dataset[traj_idx]["rewards"].shape[0],
                                              p=self.start_idx_sample_prob[traj_idx])
             else:
-                start_idx = random.randint(0, self.dataset[traj_idx]["rewards"].shape[0] - 1)
+                start_idx = random.randint(
+                    0, self.dataset[traj_idx]["rewards"].shape[0] - 1)
             yield self.__prepare_sample(traj_idx, start_idx)
-        
-        
+
+
 class TransitionDataset(IterableDataset):
     """
     A dataset of transitions (state, action, reward, next state) used for training RL agents.
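To make the padding convention in `__prepare_sample` concrete: a trajectory slice shorter than `seq_len` is right-padded and the mask records which positions are real. The sketch below is illustrative only; the `pad_along_axis` signature and early return come from the hunk above, but the `np.pad` body past the early return is an assumption based on the usual numpy idiom, and the shapes are made up.

```python
import numpy as np


def pad_along_axis(arr, pad_to, axis=0, fill_value=0.0):
    # signature and early return as patched above; the rest is one standard
    # way to finish the helper (assumed, not taken verbatim from the repo)
    pad_size = pad_to - arr.shape[axis]
    if pad_size <= 0:
        return arr
    npad = [(0, 0)] * arr.ndim
    npad[axis] = (0, pad_size)  # right-pad only the requested axis
    return np.pad(arr, pad_width=npad, mode="constant", constant_values=fill_value)


seq_len = 10
states = np.ones((7, 3))  # a 7-step slice of a trajectory with 3-dim observations

# 1 for real steps, 0 for padding; CDT later uses this as the attention padding mask
mask = np.hstack([np.ones(states.shape[0]), np.zeros(seq_len - states.shape[0])])
states = pad_along_axis(states, pad_to=seq_len)  # shape (10, 3), zero rows at the tail

assert states.shape == (10, 3) and mask.sum() == 7
```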
@@ -733,6 +799,7 @@ class TransitionDataset(IterableDataset):
         corresponds to the initial state of an episode.
     """
+
     def __init__(self,
                  dataset: dict,
                  reward_scale: float = 1.0,
@@ -744,8 +811,9 @@ def __init__(self,
         self.sample_prob = None
         self.state_init = state_init
         self.dataset_size = self.dataset["observations"].shape[0]
-        
-        self.dataset["done"] = np.logical_or(self.dataset["terminals"], self.dataset["timeouts"]).astype(np.float32)
+
+        self.dataset["done"] = np.logical_or(self.dataset["terminals"],
+                                             self.dataset["timeouts"]).astype(np.float32)
         if self.state_init:
             self.dataset["is_init"] = self.dataset["done"].copy()
             self.dataset["is_init"][1:] = self.dataset["is_init"][:-1]
diff --git a/osrl/common/exp_util.py b/osrl/common/exp_util.py
new file mode 100644
index 0000000..aa877b7
--- /dev/null
+++ b/osrl/common/exp_util.py
@@ -0,0 +1,151 @@
+import os
+import os.path as osp
+import random
+import uuid
+from typing import Dict, Optional, Sequence
+
+import numpy as np
+import torch
+import yaml
+
+
+def seed_all(seed=1029, others: Optional[list] = None):
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    # torch.use_deterministic_algorithms(True)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.deterministic = True
+    if others is not None:
+        if hasattr(others, "seed"):
+            others.seed(seed)
+            return True
+        try:
+            for item in others:
+                if hasattr(item, "seed"):
+                    item.seed(seed)
+        except TypeError:  # `others` is not iterable
+            pass
+
+
+def get_cfg_value(config, key):
+    if key in config:
+        value = config[key]
+        if isinstance(value, list):
+            suffix = ""
+            for i in value:
+                suffix += str(i)
+            return suffix
+        return str(value)
+    for k in config.keys():
+        if isinstance(config[k], dict):
+            res = get_cfg_value(config[k], key)
+            if res is not None:
+                return res
+    return "None"
+
+
+def load_config_and_model(path: str, best: bool = False):
+    '''
+    Load the configuration and trained model from a specified directory.
+
+    :param path: the directory path where the configuration and trained model are stored.
+    :param best: whether to load the best-performing model or the most recent one. Defaults to False.
+
+    :return: a tuple containing the configuration dictionary and the trained model.
+    :raises ValueError: if the specified directory does not exist.
+    '''
+    if osp.exists(path):
+        config_file = osp.join(path, "config.yaml")
+        print(f"load config from {config_file}")
+        with open(config_file) as f:
+            config = yaml.load(f.read(), Loader=yaml.FullLoader)
+        model_file = "model.pt"
+        if best:
+            model_file = "model_best.pt"
+        model_path = osp.join(path, "checkpoint/" + model_file)
+        print(f"load model from {model_path}")
+        model = torch.load(model_path)
+        return config, model
+    else:
+        raise ValueError(f"{path} doesn't exist!")
+
+
+def to_string(values):
+    '''
+    Recursively convert a sequence or dictionary of values to a string representation.
+    :param values: the sequence or dictionary of values to be converted to a string.
+    :return: a string representation of the input values.
+    '''
+    name = ""
+    if isinstance(values, Sequence) and not isinstance(values, str):
+        for i, v in enumerate(values):
+            prefix = "" if i == 0 else "_"
+            name += prefix + to_string(v)
+        return name
+    elif isinstance(values, Dict):
+        for i, k in enumerate(sorted(values.keys())):
+            prefix = "" if i == 0 else "_"
+            name += prefix + to_string(values[k])
+        return name
+    else:
+        return str(values)
+
+
+DEFAULT_SKIP_KEY = [
+    "task", "reward_threshold", "logdir", "worker", "project", "group", "name", "prefix",
+    "suffix", "save_interval", "render", "verbose", "save_ckpt", "training_num",
+    "testing_num", "epoch", "device", "thread"
+]
+
+DEFAULT_KEY_ABBRE = {
+    "cost_limit": "cost",
+    "mstep_iter_num": "mnum",
+    "estep_iter_num": "enum",
+    "estep_kl": "ekl",
+    "mstep_kl_mu": "kl_mu",
+    "mstep_kl_std": "kl_std",
+    "mstep_dual_lr": "mlr",
+    "estep_dual_lr": "elr",
+    "update_per_step": "update"
+}
+
+
+def auto_name(default_cfg: dict,
+              current_cfg: dict,
+              prefix: str = "",
+              suffix: str = "",
+              skip_keys: list = DEFAULT_SKIP_KEY,
+              key_abbre: dict = DEFAULT_KEY_ABBRE) -> str:
+    '''
+    Automatically generate the experiment name by comparing the current config with the default one.
+
+    :param dict default_cfg: a dictionary containing the default configuration values.
+    :param dict current_cfg: a dictionary containing the current configuration values.
+    :param str prefix: (optional) a string to be added at the beginning of the generated name.
+    :param str suffix: (optional) a string to be added at the end of the generated name.
+    :param list skip_keys: (optional) a list of keys to be skipped when generating the name.
+    :param dict key_abbre: (optional) a dictionary containing abbreviations for keys in the generated name.
+
+    :return str: a string representing the generated experiment name.
+    '''
+    name = prefix
+    for i, k in enumerate(sorted(default_cfg.keys())):
+        if default_cfg[k] == current_cfg[k] or k in skip_keys:
+            continue
+        prefix = "_" if len(name) else ""
+        value = to_string(current_cfg[k])
+        # replace the key with its abbreviation if it has one in key_abbre
+        if k in key_abbre:
+            k = key_abbre[k]
+        # add the key-value pair to the name with the prefix
+        name += prefix + k + value
+    if len(suffix):
+        name = name + "_" + suffix if len(name) else suffix
+
+    name = "default" if not len(name) else name
+    name = f"{name}-{str(uuid.uuid4())[:4]}"
+    return name
diff --git a/osrl/common/net.py b/osrl/common/net.py
index 9d31688..f267d24 100644
--- a/osrl/common/net.py
+++ b/osrl/common/net.py
@@ -1,13 +1,12 @@
-from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union
+import math
+from typing import Optional
 
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.distributions.normal import Normal
 from torch import distributions as pyd
-
-import numpy as np
-import math
+from torch.distributions.normal import Normal
 
 
 def mlp(sizes, activation, output_activation=nn.Identity):
@@ -44,7 +43,13 @@ class MLPGaussianPerturbationActor(nn.Module):
         act_limit (float): The absolute value of the limits of the action space.
""" - def __init__(self, obs_dim, act_dim, hidden_sizes, activation, phi=0.05, act_limit=1): + def __init__(self, + obs_dim, + act_dim, + hidden_sizes, + activation, + phi=0.05, + act_limit=1): super().__init__() pi_sizes = [obs_dim + act_dim] + list(hidden_sizes) + [act_dim] self.pi = mlp(pi_sizes, activation, nn.Tanh) @@ -68,6 +73,7 @@ class MLPActor(nn.Module): activation (Type[nn.Module]): The activation function to use between layers. act_limit (float, optional): The upper limit of the action space. """ + def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit=1): super().__init__() pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] @@ -92,16 +98,23 @@ class MLPGaussianActor(nn.Module): activation (Type[nn.Module]): The activation function to use between layers. device (str): The device to use for computation (cpu or cuda). """ - def __init__(self, obs_dim, act_dim, action_low, action_high, hidden_sizes, - activation, device="cpu"): + + def __init__(self, + obs_dim, + act_dim, + action_low, + action_high, + hidden_sizes, + activation, + device="cpu"): super().__init__() self.device = device - self.action_low = torch.nn.Parameter( - torch.tensor(action_low, device=device)[None, ...], - requires_grad=False) # (1, act_dim) - self.action_high = torch.nn.Parameter( - torch.tensor(action_high, device=device)[None, ...], - requires_grad=False) # (1, act_dim) + self.action_low = torch.nn.Parameter(torch.tensor(action_low, + device=device)[None, ...], + requires_grad=False) # (1, act_dim) + self.action_high = torch.nn.Parameter(torch.tensor(action_high, + device=device)[None, ...], + requires_grad=False) # (1, act_dim) log_std = -0.5 * np.ones(act_dim, dtype=np.float32) self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) @@ -134,6 +147,8 @@ def forward(self, obs, act=None, deterministic=False): LOG_STD_MAX = 2 LOG_STD_MIN = -20 + + class SquashedGaussianMLPActor(nn.Module): ''' A MLP Gaussian actor, can also be used as a deterministic actor @@ -174,14 +189,15 @@ def forward(self, if with_logprob: # Compute logprob from Gaussian, and then apply correction for Tanh squashing. logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1) - logp_pi -= (2 * (np.log(2) - pi_action - F.softplus(-2 * pi_action))).sum(axis=1) + logp_pi -= (2 * + (np.log(2) - pi_action - F.softplus(-2 * pi_action))).sum(axis=1) else: logp_pi = None # for BEARL only if return_pretanh_value: return torch.tanh(pi_action), pi_action - + pi_action = torch.tanh(pi_action) if with_distribution: @@ -283,6 +299,7 @@ class VAE(nn.Module): act_lim (float): The upper limit of the action space. device (str): The device to use for computation (cpu or cuda). 
""" + def __init__(self, obs_dim, act_dim, hidden_size, latent_dim, act_lim, device="cpu"): super(VAE, self).__init__() self.e1 = nn.Linear(obs_dim + act_dim, hidden_size) @@ -314,8 +331,9 @@ def forward(self, obs, act): def decode(self, obs, z=None): if z is None: - z = torch.randn((obs.shape[0], self.latent_dim)).clamp(-0.5, 0.5).to(self.device) - + z = torch.randn((obs.shape[0], self.latent_dim)).clamp(-0.5, + 0.5).to(self.device) + a = F.relu(self.d1(torch.cat([obs, z], 1))) a = F.relu(self.d2(a)) return self.act_lim * torch.tanh(self.d3(a)) @@ -323,9 +341,14 @@ def decode(self, obs, z=None): # for BEARL only def decode_multiple(self, obs, z=None, num_decode=10): if z is None: - z = torch.randn((obs.shape[0], num_decode, self.latent_dim)).clamp(-0.5, 0.5).to(self.device) - - a = F.relu(self.d1(torch.cat([obs.unsqueeze(0).repeat(num_decode, 1, 1).permute(1, 0, 2), z], 2))) + z = torch.randn( + (obs.shape[0], num_decode, self.latent_dim)).clamp(-0.5, + 0.5).to(self.device) + + a = F.relu( + self.d1( + torch.cat( + [obs.unsqueeze(0).repeat(num_decode, 1, 1).permute(1, 0, 2), z], 2))) a = F.relu(self.d2(a)) return torch.tanh(self.d3(a)), self.d3(a) @@ -360,7 +383,7 @@ def control(self, qc): self.error_old = error_new multiplier = F.relu(self.KP * F.relu(error_new) + self.KI * self.error_integral + - self.KD * error_diff) + self.KD * error_diff) return torch.mean(multiplier) @@ -380,7 +403,10 @@ def __init__( self.norm2 = nn.LayerNorm(embedding_dim) self.drop = nn.Dropout(residual_dropout) - self.attention = nn.MultiheadAttention(embedding_dim, num_heads, attention_dropout, batch_first=True) + self.attention = nn.MultiheadAttention(embedding_dim, + num_heads, + attention_dropout, + batch_first=True) self.mlp = nn.Sequential( nn.Linear(embedding_dim, 4 * embedding_dim), nn.GELU(), @@ -388,11 +414,14 @@ def __init__( nn.Dropout(residual_dropout), ) # True value indicates that the corresponding position is not allowed to attend - self.register_buffer("causal_mask", ~torch.tril(torch.ones(seq_len, seq_len)).to(bool)) + self.register_buffer("causal_mask", + ~torch.tril(torch.ones(seq_len, seq_len)).to(bool)) self.seq_len = seq_len # [batch_size, seq_len, emb_dim] -> [batch_size, seq_len, emb_dim] - def forward(self, x: torch.Tensor, padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + def forward(self, + x: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor: causal_mask = self.causal_mask[:x.shape[1], :x.shape[1]] norm_x = self.norm1(x) @@ -410,8 +439,8 @@ def forward(self, x: torch.Tensor, padding_mask: Optional[torch.Tensor] = None) x = x + self.drop(attention_out) x = x + self.mlp(self.norm2(x)) return x - - + + class TanhTransform(pyd.transforms.Transform): domain = pyd.constraints.real codomain = pyd.constraints.interval(-1.0, 1.0) diff --git a/setup.py b/setup.py index e63936a..08e54e6 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import os -import sys from setuptools import find_packages, setup @@ -15,67 +14,61 @@ def get_version() -> str: def get_install_requires() -> str: return [ - "tianshou~=0.4.10", - "gym>=0.23.1", + "dsrl", + "fast-safe-rl", "pyrallis==0.3.1", "pyyaml~=6.0", + "scipy~=1.10.1", "tqdm", "numpy>1.16.0", # https://github.com/numpy/numpy/issues/12793 "tensorboard>=2.5.0", - "torch>=1.4.0", + "torch~=1.13.0", "numba>=0.51.0", "wandb~=0.14.0", - "h5py>=2.10.0", # to match tensorflow's minimal requirements + "h5py>=2.10.0", "protobuf~=3.19.0", # breaking change, sphinx fail - "OApackage" + 
"python-dateutil==2.8.2", + "easy_runner", + "swig==4.1.1", ] def get_extras_require() -> str: req = { "dev": [ - "sphinx<4", - "sphinx_rtd_theme", - "jinja2<3.1", # temporary fix - "sphinxcontrib-bibtex", + "sphinx==6.2.1", + "sphinx_rtd_theme==1.2.0", + "jinja2==3.0.3", # temporary fix + "sphinxcontrib-bibtex==2.5.0", "flake8", "flake8-bugbear", "yapf", "isort", - "pytest", - "pytest-cov", - "ray>=1.0.0", - "wandb>=0.12.0", + "pytest~=7.3.1", + "pytest-cov~=4.0.0", "networkx", "mypy", "pydocstyle", - "doc8", + "doc8==0.11.2", "scipy", - "pillow", - "pettingzoo>=1.17", - "pygame>=2.1.0", # pettingzoo test cases pistonball - "pymunk>=6.2.1", # pettingzoo test cases pistonball - "nni>=2.3,<3.0", # expect breaking changes at next major version - "pytorch_lightning", - ], - "mujoco": ["mujoco_py"], - "pybullet": ["pybullet"], + "pre-commit", + ] } return req setup( - name="osrl", + name="osrl-lib", version=get_version(), description= - "A Modularized Implementation of Offline Safe Reinforcement Learning Algorithms", + "Elegant Implementations of Offline Safe Reinforcement Learning Algorithms", long_description=open("README.md", encoding="utf8").read(), long_description_content_type="text/markdown", url="https://github.com/liuzuxin/offline-safe-rl-baselines.git", author="Zijian Guo; Zuxin Liu", author_email="zuxin1997@gmail.com", license="MIT", - python_requires=">=3.6", + python_requires=">=3.8", classifiers=[ # How mature is this project? Common values are # 3 - Alpha @@ -90,15 +83,13 @@ def get_extras_require() -> str: "License :: OSI Approved :: MIT License", # Specify the Python versions you support here. In particular, ensure # that you indicate whether you support Python 2, Python 3 or both. - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", ], - keywords="reinforcement learning platform pytorch", + keywords="offline safe reinforcement learning algorithms pytorch", packages=find_packages( exclude=["test", "test.*", "examples", "examples.*", "docs", "docs.*"]), install_requires=get_install_requires(), extras_require=get_extras_require(), -) \ No newline at end of file +)