diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 97bbab3f..397f3266 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,7 +2,7 @@ # Each line is a file pattern followed by one or more owners. # These owners will be the default owners for everything in the repo. -* @CherryPieSexy @DT6A @Howuhh @Scitator @vkurenkov +* @Howuhh @Scitator @vkurenkov # Order is important. The last matching pattern has the most precedence. # So if a pull request only touches javascript files, only these owners diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 2e1832e8..573ca5be 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -1,45 +1,25 @@ -name: codestyle -# <- standard block end -> +name: codestyle check on: push: branches: - main pull_request: branches: - - dev - - develop - main - jobs: - build: - name: codestyle - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - max-parallel: 4 - matrix: - os: [ubuntu-20.04] - python-version: [3.8] - timeout-minutes: 30 + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - - name: set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + - uses: actions/checkout@v3 + - name: Set up Python 3.9 + uses: actions/setup-python@v4 with: - python-version: ${{ matrix.python-version }} - - - name: install dependencies + python-version: "3.9" + - name: Install dependencies run: | - # python -m pip install --upgrade --user pip + python -m pip install --upgrade pip pip install -r requirements/requirements_dev.txt - python --version - pip --version - pip list - shell: bash -# <- standard block end -> - name: check codestyle run: | - catalyst-check-codestyle --line-length 89 + ruff --config pyproject.toml --diff . \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e124e675..5c7a4c45 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,13 +1,6 @@ repos: - - repo: https://github.com/catalyst-team/codestyle - rev: 'v21.09.2' + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: 'v0.0.278' hooks: - - id: catalyst-make-codestyle - args: [--line-length=89] - - repo: https://github.com/catalyst-team/codestyle - rev: 'v21.09.2' - hooks: - - id: catalyst-check-codestyle - args: [--line-length=89] - -exclude: __init__.py + - id: ruff + args: [--fix] \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..05a5063c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,104 @@ +# CORL Contribution Guidelines + +We welcome: + +- Bug reports +- Pull requests for bug fixes +- Logs and documentation improvements +- New algorithms and datasets +- Better hyperparameters (but with proofs) + +## Contributing to the codebase + +Contributing code is done through standard github methods: + +```commandline +git clone git@github.com:tinkoff-ai/CORL.git +cd CORL +pip install -r requirements/requirements_dev.txt +``` + +1. Fork this repo +2. Make a change and commit your code +3. Submit a pull request. It will be reviewed by maintainers, and they'll give feedback or make requests as applicable + +### Code style + +The CI will run several checks on the new code pushed to the CORL repository. +These checks can also be run locally without waiting for the CI by following the steps below: +1. [install `pre-commit`](https://pre-commit.com/#install), +2. install the Git hooks by running `pre-commit install`. 
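+
+For reference, those two steps plus a manual run look roughly like this (a sketch, assuming `pre-commit` is installed, e.g. with `pip install pre-commit`):
+```commandline
+pre-commit install          # register the Git hooks once
+pre-commit run --all-files  # run every configured check over the whole repository
+```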
+
+Once those two steps are done, the Git hooks will be run automatically at every new commit.
+The Git hooks can also be run manually with `pre-commit run --all-files`, and
+if needed they can be skipped (not recommended) with `git commit --no-verify`.
+
+We use [Ruff](https://github.com/astral-sh/ruff) as our main linter. If you want to see possible
+problems before pre-commit runs, you can run `ruff check --diff .` to see the exact linter suggestions and future fixes.
+
+## Adding new algorithms
+
+All new algorithms should go into `algorithms/contrib/offline` for offline-only
+algorithms and into `algorithms/contrib/finetune` for offline-to-online algorithms.
+We as a team try to keep the core as reliable and reproducible as possible,
+but we may not have the resources to support all future algorithms.
+Therefore, this separation is necessary, as we cannot guarantee that all
+algorithms from `algorithms/contrib` exactly reproduce the results of their original publications.
+
+Make sure your new code is properly documented and that all references to the original implementations and papers are present (for example, as in [Decision Transformer](algorithms/offline/dt.py)).
+Please *explain all the tricks and possible differences from the original implementation in as much detail as possible*.
+Keep in mind that this code may be used by other researchers. Make their lives easier!
+
+### Considerations
+While we welcome any algorithm, it is better to open an issue with a proposal first
+so we can discuss the details. Unfortunately, not all algorithms are equally
+easy to understand and reproduce. We may be able to give you some advice, or, on the contrary,
+warn you that a particular algorithm will require too many computational resources
+to fully reproduce the results and that it would be better to do something else.
+
+### Running benchmarks
+
+Although you will have to do a hyperparameter search while reproducing the algorithm,
+in the end we expect to see final configs in `configs/contrib/<algo_type>/<algo_name>/<dataset_name>.yaml` with the best hyperparameters for all
+datasets considered. The configs should be in `yaml` format, containing all hyperparameters sorted
+in alphabetical order (see existing configs for inspiration).
+
+Use these conventions to name your runs in the configs:
+1. `name: <algo_name>`
+2. `group: <algo_name>-<dataset_name>-multiseed-v0`, incrementing the version if needed
+3. use our [\_\_post_init\_\_](https://github.com/tinkoff-ai/CORL/blob/962688b405f579a1ce6ec1b57e6369aaf76f9e69/algorithms/offline/awac.py#L48) implementation in your config dataclass
+
+Since we are releasing wandb logs for all algorithms, you will need to submit multiseed (~4 seeds)
+training runs to the `CORL` project in the wandb [corl-team](https://wandb.ai/corl-team) organization. We'll invite you there when the time comes.
+
+We usually use wandb sweeps for this. You can use this example config (it will work with pyrallis, as it expects a `config_path` cli argument):
+```yaml
+# sweep_config.yaml
+entity: corl-team
+project: CORL
+program: algorithms/contrib/<algo_name>.py
+method: grid
+parameters:
+  config_path:
+    # algo_type is offline or finetune (see sections above)
+    values: [
+        "configs/contrib/<algo_type>/<algo_name>/<dataset_1>.yaml",
+        "configs/contrib/<algo_type>/<algo_name>/<dataset_2>.yaml",
+        "configs/contrib/<algo_type>/<algo_name>/<dataset_3>.yaml",
+    ]
+  train_seed:
+    values: [0, 1, 2, 3]
+```
+Then proceed as usual. Create a wandb sweep with `wandb sweep sweep_config.yaml`, then run agents with `wandb agent <sweep_id>`.
+
+Based on the results, you will need to make wandb reports so that other users can easily understand them.
+You can use any of the already existing ones as an example (see [README.md](README.md)). + +### Checklist + +- [ ] Issue about new algorithm is open +- [ ] Single-file implementation is added to the `algorithms/contrib` +- [ ] PR has passed all the tests +- [ ] Evidence that implementation reproduces original results is provided +- [ ] Configs with the best hyperparameters for all datasets are added to the `configs/contrib` +- [ ] Logs and reports for best hyperparameters are submitted to our wandb organization diff --git a/README.md b/README.md index b3e49e9d..fe9723bd 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ # CORL (Clean Offline Reinforcement Learning) +[![Twitter](https://badgen.net/badge/icon/twitter?icon=twitter&label)](https://twitter.com/vladkurenkov/status/1669361090550177793) +[![arXiv](https://img.shields.io/badge/arXiv-2210.07105-b31b1b.svg)](https://arxiv.org/abs/2210.07105) [](https://github.com/tinkoff-ai/CORL/blob/main/LICENSE) -[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/) +[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) 🧵 CORL is an Offline Reinforcement Learning library that provides high-quality and easy-to-follow single-file implementations of SOTA ORL algorithms. Each implementation is backed by a research-friendly codebase, allowing you to run or tune thousands of experiments. Heavily inspired by [cleanrl](https://github.com/vwxyzjn/cleanrl) for online RL, check them out too!
@@ -11,6 +12,10 @@ * 📈 Benchmarked Implementation for N algorithms * 🖼 [Weights and Biases](https://wandb.ai/site) integration +---- +* ⭐ If you're interested in __discrete control__, make sure to check out our new library — [Katakomba](https://github.com/tinkoff-ai/katakomba). It provides both discrete control algorithms augmented with recurrence and an offline RL benchmark for the NetHack Learning environment. +---- + ## Getting started @@ -26,90 +31,180 @@ docker run --gpus=all -it --rm --name ## Algorithms Implemented -| Algorithm | Variants Implemented | Wandb Report | -|---------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------| ----------- | -| ✅ Behavioral Cloning
(BC) | [`any_percent_bc.py`](algorithms/offline/any_percent_bc.py) | [`Gym-MuJoCo, Maze2D`](https://wandb.ai/tlab/CORL/reports/BC-D4RL-Results--VmlldzoyNzA2MjE1) -| ✅ Behavioral Cloning-10%
(BC-10%) | [`any_percent_bc.py`](algorithms/offline/any_percent_bc.py) | [`Gym-MuJoCo, Maze2D`](https://wandb.ai/tlab/CORL/reports/BC-10-D4RL-Results--VmlldzoyNzEwMjcx) -| ✅ [Conservative Q-Learning for Offline Reinforcement Learning
(CQL)](https://arxiv.org/abs/2006.04779) | [`cql.py`](algorithms/offline/cql.py) | [`Gym-MuJoCo, Maze2D`](https://wandb.ai/tlab/CORL/reports/CQL-D4RL-Results--VmlldzoyNzA2MTk5) -| ✅ [Accelerating Online Reinforcement Learning with Offline Datasets
(AWAC)](https://arxiv.org/abs/2006.09359) | [`awac.py`](algorithms/offline/awac.py) | [`Gym-MuJoCo, Maze2D`](https://wandb.ai/tlab/CORL/reports/AWAC-D4RL-Results--VmlldzoyNzA2MjE3) -| ✅ [Offline Reinforcement Learning with Implicit Q-Learning
(IQL)](https://arxiv.org/abs/2110.06169) | [`iql.py`](algorithms/offline/iql.py) | [`Gym-MuJoCo, Maze2D`](https://wandb.ai/tlab/CORL/reports/IQL-D4RL-Results--VmlldzoyNzA2MTkx) -| ✅ [A Minimalist Approach to Offline Reinforcement Learning
(TD3+BC)](https://arxiv.org/abs/2106.06860) | [`td3_bc.py`](algorithms/offline/td3_bc.py) | [`Gym-MuJoCo, Maze2D`](https://wandb.ai/tlab/CORL/reports/TD3-BC-D4RL-Results--VmlldzoyNzA2MjA0) -| ✅ [Decision Transformer: Reinforcement Learning via Sequence Modeling
(DT)](https://arxiv.org/abs/2106.01345) | [`dt.py`](algorithms/offline/dt.py) | [`Gym-MuJoCo, Maze2D`](https://wandb.ai/tlab/CORL/reports/DT-D4RL-Results--VmlldzoyNzA2MTk3) -| ✅ [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble
(SAC-N)](https://arxiv.org/abs/2110.01548) | [`sac_n.py`](algorithms/offline/sac_n.py) | [`Gym-MuJoCo, Maze2D`](https://wandb.ai/tlab/CORL/reports/SAC-N-D4RL-Results--VmlldzoyNzA1NTY1) -| ✅ [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble
(EDAC)](https://arxiv.org/abs/2110.01548) | [`edac.py`](algorithms/offline/edac.py) | [`Gym-MuJoCo, Maze2D`](https://wandb.ai/tlab/CORL/reports/EDAC-D4RL-Results--VmlldzoyNzA5ODUw) -| ✅ [Q-Ensemble for Offline RL: Don't Scale the Ensemble, Scale the Batch Size
(LB-SAC)](https://arxiv.org/abs/2211.11092) | [`lb_sac.py`](algorithms/offline/lb_sac.py) | [`Gym-MuJoCo`](https://wandb.ai/tlab/CORL/reports/LB-SAC-D4RL-Results--VmlldzozNjIxMDY1) +| Algorithm | Variants Implemented | Wandb Report | +|--------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------| ----------- | +| **Offline and Offline-to-Online** | | +| ✅ [Conservative Q-Learning for Offline Reinforcement Learning
(CQL)](https://arxiv.org/abs/2006.04779) | [`offline/cql.py`](algorithms/offline/cql.py)
[`finetune/cql.py`](algorithms/finetune/cql.py) | [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-CQL--VmlldzoyNzA2MTk5)

[`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-CQL--Vmlldzo0NTQ3NTMz) +| ✅ [Accelerating Online Reinforcement Learning with Offline Datasets
(AWAC)](https://arxiv.org/abs/2006.09359) | [`offline/awac.py`](algorithms/offline/awac.py)
[`finetune/awac.py`](algorithms/finetune/awac.py) | [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-AWAC--VmlldzoyNzA2MjE3)

[`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-AWAC--VmlldzozODAyNzQz) +| ✅ [Offline Reinforcement Learning with Implicit Q-Learning
(IQL)](https://arxiv.org/abs/2110.06169) | [`offline/iql.py`](algorithms/offline/iql.py)
[`finetune/iql.py`](algorithms/finetune/iql.py) | [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-IQL--VmlldzoyNzA2MTkx)

[`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-IQL--VmlldzozNzE1MTEy) +| **Offline-to-Online only** | | +| ✅ [Supported Policy Optimization for Offline Reinforcement Learning
(SPOT)](https://arxiv.org/abs/2202.06239) | [`finetune/spot.py`](algorithms/finetune/spot.py) | [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-SPOT--VmlldzozODk5MTgx) +| ✅ [Cal-QL: Calibrated Offline RL Pre-Training for Efficient Online Fine-Tuning
(Cal-QL)](https://arxiv.org/abs/2303.05479) | [`finetune/cal_ql.py`](algorithms/finetune/cal_ql.py) | [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-Cal-QL--Vmlldzo0NTQ3NDk5) +| **Offline only** | | +| ✅ Behavioral Cloning
(BC) | [`offline/any_percent_bc.py`](algorithms/offline/any_percent_bc.py) | [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-BC--VmlldzoyNzA2MjE1) +| ✅ Behavioral Cloning-10%
(BC-10%) | [`offline/any_percent_bc.py`](algorithms/offline/any_percent_bc.py) | [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-BC-10---VmlldzoyNzEwMjcx) +| ✅ [A Minimalist Approach to Offline Reinforcement Learning
(TD3+BC)](https://arxiv.org/abs/2106.06860) | [`offline/td3_bc.py`](algorithms/offline/td3_bc.py) | [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-TD3-BC--VmlldzoyNzA2MjA0) +| ✅ [Decision Transformer: Reinforcement Learning via Sequence Modeling
(DT)](https://arxiv.org/abs/2106.01345) | [`offline/dt.py`](algorithms/offline/dt.py) | [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-Decision-Transformer--VmlldzoyNzA2MTk3) +| ✅ [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble
(SAC-N)](https://arxiv.org/abs/2110.01548) | [`offline/sac_n.py`](algorithms/offline/sac_n.py) | [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-SAC-N--VmlldzoyNzA1NTY1) +| ✅ [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble
(EDAC)](https://arxiv.org/abs/2110.01548) | [`offline/edac.py`](algorithms/offline/edac.py) | [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-EDAC--VmlldzoyNzA5ODUw) +| ✅ [Revisiting the Minimalist Approach to Offline Reinforcement Learning
(ReBRAC)](https://arxiv.org/abs/2305.09836) | [`offline/rebrac.py`](algorithms/offline/rebrac.py) | [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-ReBRAC--Vmlldzo0ODkzOTQ2) +| ✅ [Q-Ensemble for Offline RL: Don't Scale the Ensemble, Scale the Batch Size
(LB-SAC)](https://arxiv.org/abs/2211.11092) | [`offline/lb_sac.py`](algorithms/offline/lb_sac.py) | [`Offline Gym-MuJoCo`](https://wandb.ai/tlab/CORL/reports/LB-SAC-D4RL-Results--VmlldzozNjIxMDY1) + ## D4RL Benchmarks -For learning curves and all the details, you can check the links above. Here, we report reproduced **final** and **best** scores. Note that thay differ by a big margin, and some papers may use different approaches not making it always explicit which one reporting methodology they chose. - -### Last Scores -#### Gym-MuJoCo -| **Task-Name**|BC|BC-10%|TD3 + BC|CQL|IQL|AWAC|SAC-N|EDAC|DT |LB-SAC | -|------------------------------|------------|--------|--------|-----|-----|------|-------|------|----|------| -|halfcheetah-medium-v2 | 42.40±0.21 | 42.46±0.81 | 48.10±0.21 | 47.08±0.19 | 48.31±0.11 | 50.01±0.30 | 68.20±1.48 | 67.70±1.20 | 42.20±0.30| 71.21±1.35| -|halfcheetah-medium-expert-v2 | 55.95±8.49 | 90.10±2.83 | 90.78±6.98 | 95.98±0.83 | 94.55±0.21 | 95.29±0.91 | 98.96±10.74 | 104.76±0.74 | 91.55±1.10| 106.57±3.90| -|halfcheetah-medium-replay-v2 | 35.66±2.68 | 23.59±8.02 | 44.84±0.68 | 45.19±0.58 | 43.53±0.43 | 44.91±1.30 | 60.70±1.17 | 62.06±1.27 | 38.91±0.57| 64.10±0.82| -|hopper-medium-v2 | 53.51±2.03 | 55.48±8.43 | 60.37±4.03 | 64.98±6.12 | 62.75±6.02 | 63.69±4.29 | 40.82±11.44 | 101.70±0.32 | 65.10±1.86| 103.75±0.07| -|hopper-medium-expert-v2 | 52.30±4.63 | 111.16±1.19 | 101.17±10.48 | 93.89±14.34 | 106.24±6.09 | 105.29±7.19 | 101.31±13.43 | 105.19±11.64 | 110.44±0.39| 110.93±0.51| -|hopper-medium-replay-v2 | 29.81±2.39 | 70.42±9.99 | 64.42±24.84 | 87.67±14.42 | 84.57±13.49 | 98.15±2.85 | 100.33±0.90 | 99.66±0.94 | 81.77±7.93| 102.53±0.92| -|walker2d-medium-v2 | 63.23±18.76 | 67.34±5.97 | 82.71±5.51 | 80.38±3.45 | 84.03±5.42 | 69.39±31.97 | 87.47±0.76 | 93.36±1.60 | 67.63±2.93| 90.95±0.65| -|walker2d-medium-expert-v2 | 98.96±18.45 | 108.70±0.29 | 110.03±0.41 | 109.68±0.52 | 111.68±0.56 | 111.16±2.41 | 114.93±0.48 | 114.75±0.86 | 107.11±1.11| 113.46±2.31| -|walker2d-medium-replay-v2 | 21.80±11.72 | 54.35±7.32 | 85.62±4.63 | 79.24±4.97 | 82.55±8.00 | 71.73±13.98 | 78.99±0.58 | 87.10±3.21 | 59.86±3.15| 87.95±1.43| -| | | | | | | | | | | | -| **locomotion average** |50.40 | 69.29 | 76.45 | 78.23 | 79.80 | 78.85 | 83.52 | 92.92 | 73.84| 94.60| - -#### Maze2d -| **Task-Name**|BC|BC-10%|TD3 + BC|CQL|IQL|AWAC|SAC-N|EDAC|DT | -|------------------------------|------------|--------|--------|-----|-----|------|-------|------|----| -|maze2d-umaze-v1 | 0.36±10.03 | 12.18±4.95 | 29.41±14.22 | -14.83±0.47 | 37.69±1.99 | 68.30±25.72 | 130.59±19.08 | 95.26±7.37 | 18.08±29.35| -|maze2d-medium-v1 | 0.79±3.76 | 14.25±2.69 | 59.45±41.86 | 86.62±11.11 | 35.45±0.98 | 82.66±46.71 | 88.61±21.62 | 57.04±3.98 | 31.71±30.40| -|maze2d-large-v1 | 2.26±5.07 | 11.32±5.88 | 97.10±29.34 | 33.22±43.66 | 49.64±22.02 | 218.87±3.96 | 204.76±1.37 | 95.60±26.46 | 35.66±32.56| -| | | | | | | | | | | -| **maze2d average** | 1.13 | 12.58 | 61.99 | 35.00 | 40.92 | 123.28 | 141.32 | 82.64 | 28.48| - -#### Antmaze -| **Task-Name**|BC|BC-10%|TD3 + BC|CQL|IQL|AWAC|SAC-N|EDAC|DT | -|------------------------------|------------|--------|--------|-----|-----|------|-------|------|----| -|antmaze-umaze-v0 | 51.50±8.81 | 67.75±6.40 | 93.25±1.50 | 72.75±5.32 | 74.50±11.03 | 63.50±9.33 | 0.00±0.00 | 29.25±33.35 | 51.75±11.76| -|antmaze-medium-play-v0 | 0.00±0.00 | 2.50±1.91 | 0.00±0.00 | 0.00±0.00 | 71.50±12.56 | 0.00±0.00 | 0.00±0.00 | 0.00±0.00 | 0.00±0.00| -|antmaze-large-play-v0 | 0.00±0.00 | 0.00±0.00 | 0.00±0.00 | 
0.00±0.00 | 40.75±12.69 | 0.00±0.00 | 0.00±0.00 | 0.00±0.00 | 0.00±0.00| -| | | | | | | | | | | -| **antmaze average** | 17.17 | 23.42 | 31.08 | 24.25 | 62.25 | 21.17 | 0.00 | 9.75 | 17.25 | - -### Best Scores -#### Gym-MuJoCo -| **Task-Name**|BC|BC-10%|TD3 + BC|CQL|IQL|AWAC|SAC-N|EDAC|DT | LB-SAC| -|------------------------------|------------|--------|--------|-----|-----|------|-------|------|----|----| -|halfcheetah-medium-v2 | 43.60±0.16 | 43.90±0.15 | 48.93±0.13 | 47.45±0.10 | 48.77±0.06 | 50.87±0.21 | 72.21±0.35 | 69.72±1.06 | 42.73±0.11| 71.82±0.68| -|halfcheetah-medium-expert-v2 | 79.69±3.58 | 94.11±0.25 | 96.59±1.01 | 96.74±0.14 | 95.83±0.38 | 96.87±0.31 | 111.73±0.55 | 110.62±1.20 | 93.40±0.25| 110.37±0.47| -|halfcheetah-medium-replay-v2 | 40.52±0.22 | 42.27±0.53 | 45.84±0.30 | 46.38±0.14 | 45.06±0.16 | 46.57±0.27 | 67.29±0.39 | 66.55±1.21 | 40.31±0.32| 66.14±1.06| -|hopper-medium-v2 | 69.04±3.35 | 73.84±0.43 | 70.44±1.37 | 77.47±6.00 | 80.74±1.27 | 99.40±1.12 | 101.79±0.23 | 103.26±0.16 | 69.42±4.21| 103.88±0.17| -|hopper-medium-expert-v2 | 90.63±12.68 | 113.13±0.19 | 113.22±0.50 | 112.74±0.07 | 111.79±0.47 | 113.37±0.63 | 111.24±0.17 | 111.80±0.13 | 111.18±0.24| 110.93±0.51| -|hopper-medium-replay-v2 | 68.88±11.93 | 90.57±2.38 | 98.12±1.34 | 102.20±0.38 | 102.33±0.44 | 101.76±0.43 | 103.83±0.61 | 103.28±0.57 | 88.74±3.49| 104.00±0.94| -|walker2d-medium-v2 | 80.64±1.06 | 82.05±1.08 | 86.91±0.32 | 84.57±0.15 | 87.99±0.83 | 86.22±4.58 | 90.17±0.63 | 95.78±1.23 | 74.70±0.64| 90.95±0.65| -|walker2d-medium-expert-v2 | 109.95±0.72 | 109.90±0.10 | 112.21±0.07 | 111.63±0.20 | 113.19±0.33 | 113.40±2.57 | 116.93±0.49 | 116.52±0.86 | 108.71±0.39| 113.46±2.31| -|walker2d-medium-replay-v2 | 48.41±8.78 | 76.09±0.47 | 91.17±0.83 | 89.34±0.59 | 91.85±2.26 | 87.06±0.93 | 85.18±1.89 | 89.69±1.60 | 68.22±1.39| 92.25±2.20| -| | | | | | | | | | | | -| **locomotion average** | 70.15 | 80.65 | 84.83 | 85.39 | 86.40 | 88.39 | 95.60 | 96.36 | 77.49 | 95.97| - - -#### Maze2d -| **Task-Name**|BC|BC-10%|TD3 + BC|CQL|IQL|AWAC|SAC-N|EDAC|DT | -|------------------------------|------------|--------|--------|-----|-----|------|-------|------|----| -|maze2d-umaze-v1 | 16.09±1.00 | 22.49±1.75 | 99.33±18.66 | 84.92±34.40 | 44.04±3.02 | 141.92±12.88 | 153.12±7.50 | 149.88±2.27 | 63.83±20.04| -|maze2d-medium-v1 | 19.16±1.44 | 27.64±2.16 | 150.93±4.50 | 137.52±9.83 | 92.25±40.74 | 160.95±11.64 | 93.80±16.93 | 154.41±1.82 | 68.14±14.15| -|maze2d-large-v1 | 20.75±7.69 | 41.83±4.20 | 197.64±6.07 | 153.29±12.86 | 138.70±44.70 | 228.00±2.06 | 207.51±1.11 | 182.52±3.10 | 50.25±22.33| -| | | | | | | | | | | -| **maze2d average** | 18.67 | 30.65 | 149.30 | 125.25 | 91.66 | 176.96 | 151.48 | 162.27 | 60.74 | - -#### Antmaze -| **Task-Name**|BC|BC-10%|TD3 + BC|CQL|IQL|AWAC|SAC-N|EDAC|DT | -|------------------------------|------------|--------|--------|-----|-----|------|-------|------|----| -|antmaze-umaze-v0 | 71.25±9.07 | 79.50±2.38 | 97.75±1.50 | 85.00±3.56 | 87.00±2.94 | 74.75±8.77 | 0.00±0.00 | 75.00±27.51 | 60.50±3.11| -|antmaze-medium-play-v0 | 4.75±2.22 | 8.50±3.51 | 6.00±2.00 | 3.00±0.82 | 86.00±2.16 | 14.00±11.80 | 0.00±0.00 | 0.00±0.00 | 0.25±0.50| -|antmaze-large-play-v0 | 0.75±0.50 | 11.75±2.22 | 0.50±0.58 | 0.50±0.58 | 53.00±6.83 | 0.00±0.00 | 0.00±0.00 | 0.00±0.00 | 0.00±0.00| -| | | | | | | | | | | -| **antmaze average** | 25.58 | 33.25 | 34.75 | 29.50 | 75.33 | 29.58 | 0.00 | 25.00 | 20.25 | +You can check the links above for learning curves and details. 
Here, we report reproduced **final** and **best** scores. Note that they differ by a significant margin, and some papers may use different approaches, not making it always explicit which reporting methodology they chose. If you want to re-collect our results in a more structured/nuanced manner, see [`results`](results). + +### Offline +#### Last Scores +##### Gym-MuJoCo + +| **Task-Name**|BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N|EDAC|DT| +|------------------------------|------------|--------|--------|--------|-----|-----|------|-------|------|----| +|halfcheetah-medium-v2|42.40 ± 0.19|42.46 ± 0.70|48.10 ± 0.18|49.46 ± 0.62|47.04 ± 0.22|48.31 ± 0.22|64.04 ± 0.68|68.20 ± 1.28|67.70 ± 1.04|42.20 ± 0.26| +|halfcheetah-medium-replay-v2|35.66 ± 2.33|23.59 ± 6.95|44.84 ± 0.59|44.70 ± 0.69|45.04 ± 0.27|44.46 ± 0.22|51.18 ± 0.31|60.70 ± 1.01|62.06 ± 1.10|38.91 ± 0.50| +|halfcheetah-medium-expert-v2|55.95 ± 7.35|90.10 ± 2.45|90.78 ± 6.04|93.62 ± 0.41|95.63 ± 0.42|94.74 ± 0.52|103.80 ± 2.95|98.96 ± 9.31|104.76 ± 0.64|91.55 ± 0.95| +|hopper-medium-v2|53.51 ± 1.76|55.48 ± 7.30|60.37 ± 3.49|74.45 ± 9.14|59.08 ± 3.77|67.53 ± 3.78|102.29 ± 0.17|40.82 ± 9.91|101.70 ± 0.28|65.10 ± 1.61| +|hopper-medium-replay-v2|29.81 ± 2.07|70.42 ± 8.66|64.42 ± 21.52|96.39 ± 5.28|95.11 ± 5.27|97.43 ± 6.39|94.98 ± 6.53|100.33 ± 0.78|99.66 ± 0.81|81.77 ± 6.87| +|hopper-medium-expert-v2|52.30 ± 4.01|111.16 ± 1.03|101.17 ± 9.07|52.73 ± 37.47|99.26 ± 10.91|107.42 ± 7.80|109.45 ± 2.34|101.31 ± 11.63|105.19 ± 10.08|110.44 ± 0.33| +|walker2d-medium-v2|63.23 ± 16.24|67.34 ± 5.17|82.71 ± 4.78|66.53 ± 26.04|80.75 ± 3.28|80.91 ± 3.17|85.82 ± 0.77|87.47 ± 0.66|93.36 ± 1.38|67.63 ± 2.54| +|walker2d-medium-replay-v2|21.80 ± 10.15|54.35 ± 6.34|85.62 ± 4.01|82.20 ± 1.05|73.09 ± 13.22|82.15 ± 3.03|84.25 ± 2.25|78.99 ± 0.50|87.10 ± 2.78|59.86 ± 2.73| +|walker2d-medium-expert-v2|98.96 ± 15.98|108.70 ± 0.25|110.03 ± 0.36|49.41 ± 38.16|109.56 ± 0.39|111.72 ± 0.86|111.86 ± 0.43|114.93 ± 0.41|114.75 ± 0.74|107.11 ± 0.96| +| | | | | | | | | | | | +| **locomotion average** |50.40|69.29|76.45|67.72|78.28|81.63|89.74|83.52|92.92|73.84| + +##### Maze2d +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N|EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|------|-------|------|----| +| maze2d-umaze-v1 |0.36 ± 8.69|12.18 ± 4.29|29.41 ± 12.31|82.67 ± 28.30|-8.90 ± 6.11|42.11 ± 0.58|106.87 ± 22.16|130.59 ± 16.52|95.26 ± 6.39|18.08 ± 25.42| +| maze2d-medium-v1 |0.79 ± 3.25|14.25 ± 2.33|59.45 ± 36.25|52.88 ± 55.12|86.11 ± 9.68|34.85 ± 2.72|105.11 ± 31.67|88.61 ± 18.72|57.04 ± 3.45|31.71 ± 26.33| +| maze2d-large-v1 |2.26 ± 4.39|11.32 ± 5.10|97.10 ± 25.41|209.13 ± 8.19|23.75 ± 36.70|61.72 ± 3.50|78.33 ± 61.77|204.76 ± 1.19|95.60 ± 22.92|35.66 ± 28.20| +| | | | | | | | | | | | +| **maze2d average** |1.13|12.58|61.99|114.89|33.65|46.23|96.77|141.32|82.64|28.48| + +##### Antmaze +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N|EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|------|-------|------|----| +|antmaze-umaze-v2|55.25 ± 4.15|65.75 ± 5.26|70.75 ± 39.18|57.75 ± 10.28|92.75 ± 1.92|77.00 ± 5.52|97.75 ± 1.48|0.00 ± 0.00|0.00 ± 0.00|57.00 ± 9.82| +|antmaze-umaze-diverse-v2|47.25 ± 4.09|44.00 ± 1.00|44.75 ± 11.61|58.00 ± 7.68|37.25 ± 3.70|54.25 ± 5.54|83.50 ± 7.02|0.00 ± 0.00|0.00 ± 0.00|51.75 ± 0.43| +|antmaze-medium-play-v2|0.00 ± 0.00|2.00 ± 0.71|0.25 ± 0.43|0.00 ± 0.00|65.75 ± 11.61|65.75 ± 11.71|89.50 ± 3.35|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00| 
+|antmaze-medium-diverse-v2|0.75 ± 0.83|5.75 ± 9.39|0.25 ± 0.43|0.00 ± 0.00|67.25 ± 3.56|73.75 ± 5.45|83.50 ± 8.20|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00| +|antmaze-large-play-v2|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|20.75 ± 7.26|42.00 ± 4.53|52.25 ± 29.01|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00| +|antmaze-large-diverse-v2|0.00 ± 0.00|0.75 ± 0.83|0.00 ± 0.00|0.00 ± 0.00|20.50 ± 13.24|30.25 ± 3.63|64.00 ± 5.43|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00| +| | | | | | | | | | | | +| **antmaze average** | 17.21|19.71|19.33|19.29|50.71|57.17|78.42|0.00|0.00|18.12| + +##### Adroit +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N|EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|------|-------|------|----| +|pen-human-v1|71.03 ± 6.26|26.99 ± 9.60|-3.88 ± 0.21|81.12 ± 13.47|13.71 ± 16.98|78.49 ± 8.21|103.16 ± 8.49|6.86 ± 5.93|5.07 ± 6.16|67.68 ± 5.48| +|pen-cloned-v1|51.92 ± 15.15|46.67 ± 14.25|5.13 ± 5.28|89.56 ± 15.57|1.04 ± 6.62|83.42 ± 8.19|102.79 ± 7.84|31.35 ± 2.14|12.02 ± 1.75|64.43 ± 1.43| +|pen-expert-v1|109.65 ± 7.28|114.96 ± 2.96|122.53 ± 21.27|160.37 ± 1.21|-1.41 ± 2.34|128.05 ± 9.21|152.16 ± 6.33|87.11 ± 48.95|-1.55 ± 0.81|116.38 ± 1.27| +|door-human-v1|2.34 ± 4.00|-0.13 ± 0.07|-0.33 ± 0.01|4.60 ± 1.90|5.53 ± 1.31|3.26 ± 1.83|-0.10 ± 0.01|-0.38 ± 0.00|-0.12 ± 0.13|4.44 ± 0.87| +|door-cloned-v1|-0.09 ± 0.03|0.29 ± 0.59|-0.34 ± 0.01|0.93 ± 1.66|-0.33 ± 0.01|3.07 ± 1.75|0.06 ± 0.05|-0.33 ± 0.00|2.66 ± 2.31|7.64 ± 3.26| +|door-expert-v1|105.35 ± 0.09|104.04 ± 1.46|-0.33 ± 0.01|104.85 ± 0.24|-0.32 ± 0.02|106.65 ± 0.25|106.37 ± 0.29|-0.33 ± 0.00|106.29 ± 1.73|104.87 ± 0.39| +|hammer-human-v1|3.03 ± 3.39|-0.19 ± 0.02|1.02 ± 0.24|3.37 ± 1.93|0.14 ± 0.11|1.79 ± 0.80|0.24 ± 0.24|0.24 ± 0.00|0.28 ± 0.18|1.28 ± 0.15| +|hammer-cloned-v1|0.55 ± 0.16|0.12 ± 0.08|0.25 ± 0.01|0.21 ± 0.24|0.30 ± 0.01|1.50 ± 0.69|5.00 ± 3.75|0.14 ± 0.09|0.19 ± 0.07|1.82 ± 0.55| +|hammer-expert-v1|126.78 ± 0.64|121.75 ± 7.67|3.11 ± 0.03|127.06 ± 0.29|0.26 ± 0.01|128.68 ± 0.33|133.62 ± 0.27|25.13 ± 43.25|28.52 ± 49.00|117.45 ± 6.65| +|relocate-human-v1|0.04 ± 0.03|-0.14 ± 0.08|-0.29 ± 0.01|0.05 ± 0.03|0.06 ± 0.03|0.12 ± 0.04|0.16 ± 0.30|-0.31 ± 0.01|-0.17 ± 0.17|0.05 ± 0.01| +|relocate-cloned-v1|-0.06 ± 0.01|-0.00 ± 0.02|-0.30 ± 0.01|-0.04 ± 0.04|-0.29 ± 0.01|0.04 ± 0.01|1.66 ± 2.59|-0.01 ± 0.10|0.17 ± 0.35|0.16 ± 0.09| +|relocate-expert-v1|107.58 ± 1.20|97.90 ± 5.21|-1.73 ± 0.96|108.87 ± 0.85|-0.30 ± 0.02|106.11 ± 4.02|107.52 ± 2.28|-0.36 ± 0.00|71.94 ± 18.37|104.28 ± 0.42| +| | | | | | | | | | | | +| **adroit average** | 48.18|42.69|10.40|56.75|1.53|53.43|59.39|12.43|18.78|49.21| + +#### Best Scores +##### Gym-MuJoCo +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N|EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|------|-------|------|----| +|halfcheetah-medium-v2|43.60 ± 0.14|43.90 ± 0.13|48.93 ± 0.11|50.06 ± 0.50|47.62 ± 0.03|48.84 ± 0.07|65.62 ± 0.46|72.21 ± 0.31|69.72 ± 0.92|42.73 ± 0.10| +|halfcheetah-medium-replay-v2|40.52 ± 0.19|42.27 ± 0.46|45.84 ± 0.26|46.35 ± 0.29|46.43 ± 0.19|45.35 ± 0.08|52.22 ± 0.31|67.29 ± 0.34|66.55 ± 1.05|40.31 ± 0.28| +|halfcheetah-medium-expert-v2|79.69 ± 3.10|94.11 ± 0.22|96.59 ± 0.87|96.11 ± 0.37|97.04 ± 0.17|95.38 ± 0.17|108.89 ± 1.20|111.73 ± 0.47|110.62 ± 1.04|93.40 ± 0.21| +|hopper-medium-v2|69.04 ± 2.90|73.84 ± 0.37|70.44 ± 1.18|97.90 ± 0.56|70.80 ± 1.98|80.46 ± 3.09|103.19 ± 0.16|101.79 ± 0.20|103.26 ± 0.14|69.42 ± 3.64| +|hopper-medium-replay-v2|68.88 ± 10.33|90.57 ± 2.07|98.12 ± 
1.16|100.91 ± 1.50|101.63 ± 0.55|102.69 ± 0.96|102.57 ± 0.45|103.83 ± 0.53|103.28 ± 0.49|88.74 ± 3.02| +|hopper-medium-expert-v2|90.63 ± 10.98|113.13 ± 0.16|113.22 ± 0.43|103.82 ± 12.81|112.84 ± 0.66|113.18 ± 0.38|113.16 ± 0.43|111.24 ± 0.15|111.80 ± 0.11|111.18 ± 0.21| +|walker2d-medium-v2|80.64 ± 0.91|82.05 ± 0.93|86.91 ± 0.28|83.37 ± 2.82|84.77 ± 0.20|87.58 ± 0.48|87.79 ± 0.19|90.17 ± 0.54|95.78 ± 1.07|74.70 ± 0.56| +|walker2d-medium-replay-v2|48.41 ± 7.61|76.09 ± 0.40|91.17 ± 0.72|86.51 ± 1.15|89.39 ± 0.88|89.94 ± 0.93|91.11 ± 0.63|85.18 ± 1.63|89.69 ± 1.39|68.22 ± 1.20| +|walker2d-medium-expert-v2|109.95 ± 0.62|109.90 ± 0.09|112.21 ± 0.06|108.28 ± 9.45|111.63 ± 0.38|113.06 ± 0.53|112.49 ± 0.18|116.93 ± 0.42|116.52 ± 0.75|108.71 ± 0.34| +| | | | | | | | | | | | +| **locomotion average** | 70.15|80.65|84.83|85.92|84.68|86.28|93.00|95.60|96.36|77.49| + + +##### Maze2d +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N|EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|------|-------|------|----| +|maze2d-umaze-v1|16.09 ± 0.87|22.49 ± 1.52|99.33 ± 16.16|136.61 ± 11.65|92.05 ± 13.66|50.92 ± 4.23|162.28 ± 1.79|153.12 ± 6.49|149.88 ± 1.97|63.83 ± 17.35| +|maze2d-medium-v1|19.16 ± 1.24|27.64 ± 1.87|150.93 ± 3.89|131.50 ± 25.38|128.66 ± 5.44|122.69 ± 30.00|150.12 ± 4.48|93.80 ± 14.66|154.41 ± 1.58|68.14 ± 12.25| +|maze2d-large-v1|20.75 ± 6.66|41.83 ± 3.64|197.64 ± 5.26|227.93 ± 1.90|157.51 ± 7.32|162.25 ± 44.18|197.55 ± 5.82|207.51 ± 0.96|182.52 ± 2.68|50.25 ± 19.34| +| | | | | | | | | | | | +| **maze2d average** | 18.67|30.65|149.30|165.35|126.07|111.95|169.98|151.48|162.27|60.74| + +##### Antmaze +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N|EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|------|-------|------|----| +|antmaze-umaze-v2|68.50 ± 2.29|77.50 ± 1.50|98.50 ± 0.87|78.75 ± 6.76|94.75 ± 0.83|84.00 ± 4.06|100.00 ± 0.00|0.00 ± 0.00|42.50 ± 28.61|64.50 ± 2.06| +|antmaze-umaze-diverse-v2|64.75 ± 4.32|63.50 ± 2.18|71.25 ± 5.76|88.25 ± 2.17|53.75 ± 2.05|79.50 ± 3.35|96.75 ± 2.28|0.00 ± 0.00|0.00 ± 0.00|60.50 ± 2.29| +|antmaze-medium-play-v2|4.50 ± 1.12|6.25 ± 2.38|3.75 ± 1.30|27.50 ± 9.39|80.50 ± 3.35|78.50 ± 3.84|93.50 ± 2.60|0.00 ± 0.00|0.00 ± 0.00|0.75 ± 0.43| +|antmaze-medium-diverse-v2|4.75 ± 1.09|16.50 ± 5.59|5.50 ± 1.50|33.25 ± 16.81|71.00 ± 4.53|83.50 ± 1.80|91.75 ± 2.05|0.00 ± 0.00|0.00 ± 0.00|0.50 ± 0.50| +|antmaze-large-play-v2|0.50 ± 0.50|13.50 ± 9.76|1.25 ± 0.43|1.00 ± 0.71|34.75 ± 5.85|53.50 ± 2.50|68.75 ± 13.90|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00| +|antmaze-large-diverse-v2|0.75 ± 0.43|6.25 ± 1.79|0.25 ± 0.43|0.50 ± 0.50|36.25 ± 3.34|53.00 ± 3.00|69.50 ± 7.26|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00| +| | | | | | | | | | | | +| **antmaze average** |23.96|30.58|30.08|38.21|61.83|72.00|86.71|0.00|7.08|21.04| + +##### Adroit +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N|EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|------|-------|------|----| +|pen-human-v1|99.69 ± 7.45|59.89 ± 8.03|9.95 ± 8.19|121.05 ± 5.47|58.91 ± 1.81|106.15 ± 10.28|127.28 ± 3.22|56.48 ± 7.17|35.84 ± 10.57|77.83 ± 2.30| +|pen-cloned-v1|99.14 ± 12.27|83.62 ± 11.75|52.66 ± 6.33|129.66 ± 1.27|14.74 ± 2.31|114.05 ± 4.78|128.64 ± 7.15|52.69 ± 5.30|26.90 ± 7.85|71.17 ± 2.70| +|pen-expert-v1|128.77 ± 5.88|134.36 ± 3.16|142.83 ± 7.72|162.69 ± 0.23|14.86 ± 4.07|140.01 ± 6.36|157.62 ± 0.26|116.43 ± 40.26|36.04 ± 4.60|119.49 ± 2.31| +|door-human-v1|9.41 ± 4.55|7.00 ± 6.77|-0.11 ± 
0.06|19.28 ± 1.46|13.28 ± 2.77|13.52 ± 1.22|0.27 ± 0.43|-0.10 ± 0.06|2.51 ± 2.26|7.36 ± 1.24| +|door-cloned-v1|3.40 ± 0.95|10.37 ± 4.09|-0.20 ± 0.11|12.61 ± 0.60|-0.08 ± 0.13|9.02 ± 1.47|7.73 ± 6.80|-0.21 ± 0.10|20.36 ± 1.11|11.18 ± 0.96| +|door-expert-v1|105.84 ± 0.23|105.92 ± 0.24|4.49 ± 7.39|106.77 ± 0.24|59.47 ± 25.04|107.29 ± 0.37|106.78 ± 0.04|0.05 ± 0.02|109.22 ± 0.24|105.49 ± 0.09| +|hammer-human-v1|12.61 ± 4.87|6.23 ± 4.79|2.38 ± 0.14|22.03 ± 8.13|0.30 ± 0.05|6.86 ± 2.38|1.18 ± 0.15|0.25 ± 0.00|3.49 ± 2.17|1.68 ± 0.11| +|hammer-cloned-v1|8.90 ± 4.04|8.72 ± 3.28|0.96 ± 0.30|14.67 ± 1.94|0.32 ± 0.03|11.63 ± 1.70|48.16 ± 6.20|12.67 ± 15.02|0.27 ± 0.01|2.74 ± 0.22| +|hammer-expert-v1|127.89 ± 0.57|128.15 ± 0.66|33.31 ± 47.65|129.66 ± 0.33|0.93 ± 1.12|129.76 ± 0.37|134.74 ± 0.30|91.74 ± 47.77|69.44 ± 47.00|127.39 ± 0.10| +|relocate-human-v1|0.59 ± 0.27|0.16 ± 0.14|-0.29 ± 0.01|2.09 ± 0.76|1.03 ± 0.20|1.22 ± 0.28|3.70 ± 2.34|-0.18 ± 0.14|0.05 ± 0.02|0.08 ± 0.02| +|relocate-cloned-v1|0.45 ± 0.31|0.74 ± 0.45|-0.02 ± 0.04|0.94 ± 0.68|-0.07 ± 0.02|1.78 ± 0.70|9.25 ± 2.56|0.10 ± 0.04|4.11 ± 1.39|0.34 ± 0.09| +|relocate-expert-v1|110.31 ± 0.36|109.77 ± 0.60|0.23 ± 0.27|111.56 ± 0.17|0.03 ± 0.10|110.12 ± 0.82|111.14 ± 0.23|-0.07 ± 0.08|98.32 ± 3.75|106.49 ± 0.30| +| | | | | | | | | | | | +| **adroit average** | 58.92|54.58|20.51|69.42|13.65|62.62|69.71|27.49|33.88|52.60| + +### Offline-to-Online +#### Scores +| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL| +|---------------------------|------------|--------|--------|-----|-----| +|antmaze-umaze-v2|52.75 ± 8.67 → 98.75 ± 1.09|94.00 ± 1.58 → 99.50 ± 0.87|77.00 ± 0.71 → 96.50 ± 1.12|91.00 ± 2.55 → 99.50 ± 0.50|76.75 ± 7.53 → 99.75 ± 0.43| +|antmaze-umaze-diverse-v2|56.00 ± 2.74 → 0.00 ± 0.00|9.50 ± 9.91 → 99.00 ± 1.22|59.50 ± 9.55 → 63.75 ± 25.02|36.25 ± 2.17 → 95.00 ± 3.67|32.00 ± 27.79 → 98.50 ± 1.12| +|antmaze-medium-play-v2|0.00 ± 0.00 → 0.00 ± 0.00|59.00 ± 11.18 → 97.75 ± 1.30|71.75 ± 2.95 → 89.75 ± 1.09|67.25 ± 10.47 → 97.25 ± 1.30|71.75 ± 3.27 → 98.75 ± 1.64| +|antmaze-medium-diverse-v2|0.00 ± 0.00 → 0.00 ± 0.00|63.50 ± 6.84 → 97.25 ± 1.92|64.25 ± 1.92 → 92.25 ± 2.86|73.75 ± 7.29 → 94.50 ± 1.66|62.00 ± 4.30 → 98.25 ± 1.48| +|antmaze-large-play-v2|0.00 ± 0.00 → 0.00 ± 0.00|28.75 ± 7.76 → 88.25 ± 2.28|38.50 ± 8.73 → 64.50 ± 17.04|31.50 ± 12.58 → 87.00 ± 3.24|31.75 ± 8.87 → 97.25 ± 1.79| +|antmaze-large-diverse-v2|0.00 ± 0.00 → 0.00 ± 0.00|35.50 ± 3.64 → 91.75 ± 3.96|26.75 ± 3.77 → 64.25 ± 4.15|17.50 ± 7.26 → 81.00 ± 14.14|44.00 ± 8.69 → 91.50 ± 3.91| +| | | | | | | | | | | +| **antmaze average** |18.12 → 16.46|48.38 → 95.58|56.29 → 78.50|52.88 → 92.38|53.04 → 97.33| +| | | | | | | | | | | +|pen-cloned-v1|88.66 ± 15.10 → 86.82 ± 11.12|-2.76 ± 0.08 → -1.28 ± 2.16|84.19 ± 3.96 → 102.02 ± 20.75|6.19 ± 5.21 → 43.63 ± 20.09|-2.66 ± 0.04 → -2.68 ± 0.12| +|door-cloned-v1|0.93 ± 1.66 → 0.01 ± 0.00|-0.33 ± 0.01 → -0.33 ± 0.01|1.19 ± 0.93 → 20.34 ± 9.32|-0.21 ± 0.14 → 0.02 ± 0.31|-0.33 ± 0.01 → -0.33 ± 0.01| +|hammer-cloned-v1|1.80 ± 3.01 → 0.24 ± 0.04|0.56 ± 0.55 → 2.85 ± 4.81|1.35 ± 0.32 → 57.27 ± 28.49|3.97 ± 6.39 → 3.73 ± 4.99|0.25 ± 0.04 → 0.17 ± 0.17| +|relocate-cloned-v1|-0.04 ± 0.04 → -0.04 ± 0.01|-0.33 ± 0.01 → -0.33 ± 0.01|0.04 ± 0.04 → 0.32 ± 0.38|-0.24 ± 0.01 → -0.15 ± 0.05|-0.31 ± 0.05 → -0.31 ± 0.04| +| | | | | | | | | | | +| **adroit average** |22.84 → 21.76|-0.72 → 0.22|21.69 → 44.99|2.43 → 11.81|-0.76 → -0.79| + +#### Regrets +| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL| 
+|---------------------------|------------|--------|--------|-----|-----| +|antmaze-umaze-v2|0.04 ± 0.01|0.02 ± 0.00|0.07 ± 0.00|0.02 ± 0.00|0.01 ± 0.00| +|antmaze-umaze-diverse-v2|0.88 ± 0.01|0.09 ± 0.01|0.43 ± 0.11|0.22 ± 0.07|0.05 ± 0.01| +|antmaze-medium-play-v2|1.00 ± 0.00|0.08 ± 0.01|0.09 ± 0.01|0.06 ± 0.00|0.04 ± 0.01| +|antmaze-medium-diverse-v2|1.00 ± 0.00|0.08 ± 0.00|0.10 ± 0.01|0.05 ± 0.01|0.04 ± 0.01| +|antmaze-large-play-v2|1.00 ± 0.00|0.21 ± 0.02|0.34 ± 0.05|0.29 ± 0.07|0.13 ± 0.02| +|antmaze-large-diverse-v2|1.00 ± 0.00|0.21 ± 0.03|0.41 ± 0.03|0.23 ± 0.08|0.13 ± 0.02| +| | | | | | | | | | | +| **antmaze average** |0.82|0.11|0.24|0.15|0.07| +| | | | | | | | | | | +|pen-cloned-v1|0.46 ± 0.02|0.97 ± 0.00|0.37 ± 0.01|0.58 ± 0.02|0.98 ± 0.01| +|door-cloned-v1|1.00 ± 0.00|1.00 ± 0.00|0.83 ± 0.03|0.99 ± 0.01|1.00 ± 0.00| +|hammer-cloned-v1|1.00 ± 0.00|1.00 ± 0.00|0.65 ± 0.10|0.98 ± 0.01|1.00 ± 0.00| +|relocate-cloned-v1|1.00 ± 0.00|1.00 ± 0.00|1.00 ± 0.00|1.00 ± 0.00|1.00 ± 0.00| +| | | | | | | | | | | +| **adroit average** |0.86|0.99|0.71|0.89|0.99| ## Citing CORL diff --git a/algorithms/finetune/awac.py b/algorithms/finetune/awac.py new file mode 100644 index 00000000..e44cd26a --- /dev/null +++ b/algorithms/finetune/awac.py @@ -0,0 +1,623 @@ +import os +import random +import uuid +from copy import deepcopy +from dataclasses import asdict, dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import d4rl +import gym +import numpy as np +import pyrallis +import torch +import torch.nn as nn +import torch.nn.functional +import wandb +from tqdm import trange + +TensorBatch = List[torch.Tensor] + +ENVS_WITH_GOAL = ("antmaze", "pen", "door", "hammer", "relocate") + + +@dataclass +class TrainConfig: + project: str = "CORL" + group: str = "AWAC-D4RL" + name: str = "AWAC" + checkpoints_path: Optional[str] = None + + env_name: str = "halfcheetah-medium-expert-v2" + seed: int = 42 + eval_seed: int = 0 # Eval environment seed + test_seed: int = 69 + deterministic_torch: bool = True + device: str = "cuda" + + buffer_size: int = 20_000_000 + offline_iterations: int = int(1e6) # Number of offline updates + online_iterations: int = int(1e6) # Number of online updates + batch_size: int = 256 + eval_frequency: int = 1000 + n_test_episodes: int = 10 + normalize_reward: bool = False + + hidden_dim: int = 256 + learning_rate: float = 3e-4 + gamma: float = 0.99 + tau: float = 5e-3 + awac_lambda: float = 1.0 + + def __post_init__(self): + self.name = f"{self.name}-{self.env_name}-{str(uuid.uuid4())[:8]}" + if self.checkpoints_path is not None: + self.checkpoints_path = os.path.join(self.checkpoints_path, self.name) + + +class ReplayBuffer: + def __init__( + self, + state_dim: int, + action_dim: int, + buffer_size: int, + device: str = "cpu", + ): + self._buffer_size = buffer_size + self._pointer = 0 + self._size = 0 + + self._states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._actions = torch.zeros( + (buffer_size, action_dim), dtype=torch.float32, device=device + ) + self._rewards = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._next_states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._dones = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._device = device + + def _to_tensor(self, data: np.ndarray) -> torch.Tensor: + return torch.tensor(data, dtype=torch.float32, device=self._device) + + def load_d4rl_dataset(self, data: Dict[str, 
np.ndarray]): + if self._size != 0: + raise ValueError("Trying to load data into non-empty replay buffer") + n_transitions = data["observations"].shape[0] + if n_transitions > self._buffer_size: + raise ValueError( + "Replay buffer is smaller than the dataset you are trying to load!" + ) + self._states[:n_transitions] = self._to_tensor(data["observations"]) + self._actions[:n_transitions] = self._to_tensor(data["actions"]) + self._rewards[:n_transitions] = self._to_tensor(data["rewards"][..., None]) + self._next_states[:n_transitions] = self._to_tensor(data["next_observations"]) + self._dones[:n_transitions] = self._to_tensor(data["terminals"][..., None]) + self._size += n_transitions + self._pointer = min(self._size, n_transitions) + + print(f"Dataset size: {n_transitions}") + + def sample(self, batch_size: int) -> TensorBatch: + indices = np.random.randint(0, self._size, size=batch_size) + states = self._states[indices] + actions = self._actions[indices] + rewards = self._rewards[indices] + next_states = self._next_states[indices] + dones = self._dones[indices] + return [states, actions, rewards, next_states, dones] + + def add_transition( + self, + state: np.ndarray, + action: np.ndarray, + reward: float, + next_state: np.ndarray, + done: bool, + ): + # Use this method to add new data into the replay buffer during fine-tuning. + self._states[self._pointer] = self._to_tensor(state) + self._actions[self._pointer] = self._to_tensor(action) + self._rewards[self._pointer] = self._to_tensor(reward) + self._next_states[self._pointer] = self._to_tensor(next_state) + self._dones[self._pointer] = self._to_tensor(done) + + self._pointer = (self._pointer + 1) % self._buffer_size + self._size = min(self._size + 1, self._buffer_size) + + +def set_env_seed(env: Optional[gym.Env], seed: int): + env.seed(seed) + env.action_space.seed(seed) + + +class Actor(nn.Module): + def __init__( + self, + state_dim: int, + action_dim: int, + hidden_dim: int, + min_log_std: float = -20.0, + max_log_std: float = 2.0, + min_action: float = -1.0, + max_action: float = 1.0, + ): + super().__init__() + self._mlp = nn.Sequential( + nn.Linear(state_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, action_dim), + ) + self._log_std = nn.Parameter(torch.zeros(action_dim, dtype=torch.float32)) + self._min_log_std = min_log_std + self._max_log_std = max_log_std + self._min_action = min_action + self._max_action = max_action + + def _get_policy(self, state: torch.Tensor) -> torch.distributions.Distribution: + mean = self._mlp(state) + log_std = self._log_std.clamp(self._min_log_std, self._max_log_std) + policy = torch.distributions.Normal(mean, log_std.exp()) + return policy + + def log_prob(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor: + policy = self._get_policy(state) + log_prob = policy.log_prob(action).sum(-1, keepdim=True) + return log_prob + + def forward(self, state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + policy = self._get_policy(state) + action = policy.rsample() + action.clamp_(self._min_action, self._max_action) + log_prob = policy.log_prob(action).sum(-1, keepdim=True) + return action, log_prob + + def act(self, state: np.ndarray, device: str) -> np.ndarray: + state_t = torch.tensor(state[None], dtype=torch.float32, device=device) + policy = self._get_policy(state_t) + if self._mlp.training: + action_t = policy.sample() + else: + action_t = policy.mean + action = 
action_t[0].cpu().numpy() + return action + + +class Critic(nn.Module): + def __init__( + self, + state_dim: int, + action_dim: int, + hidden_dim: int, + ): + super().__init__() + self._mlp = nn.Sequential( + nn.Linear(state_dim + action_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, 1), + ) + + def forward(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor: + q_value = self._mlp(torch.cat([state, action], dim=-1)) + return q_value + + +def soft_update(target: nn.Module, source: nn.Module, tau: float): + for target_param, source_param in zip(target.parameters(), source.parameters()): + target_param.data.copy_((1 - tau) * target_param.data + tau * source_param.data) + + +class AdvantageWeightedActorCritic: + def __init__( + self, + actor: nn.Module, + actor_optimizer: torch.optim.Optimizer, + critic_1: nn.Module, + critic_1_optimizer: torch.optim.Optimizer, + critic_2: nn.Module, + critic_2_optimizer: torch.optim.Optimizer, + gamma: float = 0.99, + tau: float = 5e-3, # parameter for the soft target update, + awac_lambda: float = 1.0, + exp_adv_max: float = 100.0, + ): + self._actor = actor + self._actor_optimizer = actor_optimizer + + self._critic_1 = critic_1 + self._critic_1_optimizer = critic_1_optimizer + self._target_critic_1 = deepcopy(critic_1) + + self._critic_2 = critic_2 + self._critic_2_optimizer = critic_2_optimizer + self._target_critic_2 = deepcopy(critic_2) + + self._gamma = gamma + self._tau = tau + self._awac_lambda = awac_lambda + self._exp_adv_max = exp_adv_max + + def _actor_loss( + self, + states: torch.Tensor, + actions: torch.Tensor, + ) -> torch.Tensor: + with torch.no_grad(): + pi_action, _ = self._actor(states) + v = torch.min( + self._critic_1(states, pi_action), self._critic_2(states, pi_action) + ) + + q = torch.min( + self._critic_1(states, actions), self._critic_2(states, actions) + ) + adv = q - v + weights = torch.clamp_max( + torch.exp(adv / self._awac_lambda), self._exp_adv_max + ) + + action_log_prob = self._actor.log_prob(states, actions) + loss = (-action_log_prob * weights).mean() + return loss + + def _critic_loss( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + dones: torch.Tensor, + next_states: torch.Tensor, + ) -> torch.Tensor: + with torch.no_grad(): + next_actions, _ = self._actor(next_states) + + q_next = torch.min( + self._target_critic_1(next_states, next_actions), + self._target_critic_2(next_states, next_actions), + ) + q_target = rewards + self._gamma * (1.0 - dones) * q_next + + q1 = self._critic_1(states, actions) + q2 = self._critic_2(states, actions) + + q1_loss = nn.functional.mse_loss(q1, q_target) + q2_loss = nn.functional.mse_loss(q2, q_target) + loss = q1_loss + q2_loss + return loss + + def _update_critic( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + dones: torch.Tensor, + next_states: torch.Tensor, + ): + loss = self._critic_loss(states, actions, rewards, dones, next_states) + self._critic_1_optimizer.zero_grad() + self._critic_2_optimizer.zero_grad() + loss.backward() + self._critic_1_optimizer.step() + self._critic_2_optimizer.step() + return loss.item() + + def _update_actor(self, states, actions): + loss = self._actor_loss(states, actions) + self._actor_optimizer.zero_grad() + loss.backward() + self._actor_optimizer.step() + return loss.item() + + def update(self, batch: TensorBatch) -> Dict[str, float]: + states, actions, rewards, 
next_states, dones = batch + critic_loss = self._update_critic(states, actions, rewards, dones, next_states) + actor_loss = self._update_actor(states, actions) + + soft_update(self._target_critic_1, self._critic_1, self._tau) + soft_update(self._target_critic_2, self._critic_2, self._tau) + + result = {"critic_loss": critic_loss, "actor_loss": actor_loss} + return result + + def state_dict(self) -> Dict[str, Any]: + return { + "actor": self._actor.state_dict(), + "critic_1": self._critic_1.state_dict(), + "critic_2": self._critic_2.state_dict(), + } + + def load_state_dict(self, state_dict: Dict[str, Any]): + self._actor.load_state_dict(state_dict["actor"]) + self._critic_1.load_state_dict(state_dict["critic_1"]) + self._critic_2.load_state_dict(state_dict["critic_2"]) + + +def set_seed( + seed: int, env: Optional[gym.Env] = None, deterministic_torch: bool = False +): + if env is not None: + set_env_seed(env, seed) + os.environ["PYTHONHASHSEED"] = str(seed) + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(deterministic_torch) + + +def compute_mean_std(states: np.ndarray, eps: float) -> Tuple[np.ndarray, np.ndarray]: + mean = states.mean(0) + std = states.std(0) + eps + return mean, std + + +def normalize_states(states: np.ndarray, mean: np.ndarray, std: np.ndarray): + return (states - mean) / std + + +def wrap_env( + env: gym.Env, + state_mean: Union[np.ndarray, float] = 0.0, + state_std: Union[np.ndarray, float] = 1.0, +) -> gym.Env: + def normalize_state(state): + return (state - state_mean) / state_std + + env = gym.wrappers.TransformObservation(env, normalize_state) + return env + + +def is_goal_reached(reward: float, info: Dict) -> bool: + if "goal_achieved" in info: + return info["goal_achieved"] + return reward > 0 # Assuming that reaching target is a positive reward + + +@torch.no_grad() +def eval_actor( + env: gym.Env, actor: Actor, device: str, n_episodes: int, seed: int +) -> Tuple[np.ndarray, np.ndarray]: + env.seed(seed) + actor.eval() + episode_rewards = [] + successes = [] + for _ in range(n_episodes): + state, done = env.reset(), False + episode_reward = 0.0 + goal_achieved = False + while not done: + action = actor.act(state, device) + state, reward, done, env_infos = env.step(action) + episode_reward += reward + if not goal_achieved: + goal_achieved = is_goal_reached(reward, env_infos) + # Valid only for environments with goal + successes.append(float(goal_achieved)) + episode_rewards.append(episode_reward) + + actor.train() + return np.asarray(episode_rewards), np.mean(successes) + + +def return_reward_range(dataset: Dict, max_episode_steps: int) -> Tuple[float, float]: + returns, lengths = [], [] + ep_ret, ep_len = 0.0, 0 + for r, d in zip(dataset["rewards"], dataset["terminals"]): + ep_ret += float(r) + ep_len += 1 + if d or ep_len == max_episode_steps: + returns.append(ep_ret) + lengths.append(ep_len) + ep_ret, ep_len = 0.0, 0 + lengths.append(ep_len) # but still keep track of number of steps + assert sum(lengths) == len(dataset["rewards"]) + return min(returns), max(returns) + + +def modify_reward(dataset: Dict, env_name: str, max_episode_steps: int = 1000) -> Dict: + if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): + min_ret, max_ret = return_reward_range(dataset, max_episode_steps) + dataset["rewards"] /= max_ret - min_ret + dataset["rewards"] *= max_episode_steps + return { + "max_ret": max_ret, + "min_ret": min_ret, + "max_episode_steps": max_episode_steps, + } + elif "antmaze" in 
env_name: + dataset["rewards"] -= 1.0 + return {} + + +def modify_reward_online(reward: float, env_name: str, **kwargs) -> float: + if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): + reward /= kwargs["max_ret"] - kwargs["min_ret"] + reward *= kwargs["max_episode_steps"] + elif "antmaze" in env_name: + reward -= 1.0 + return reward + + +def wandb_init(config: dict) -> None: + wandb.init( + config=config, + project=config["project"], + group=config["group"], + name=config["name"], + id=str(uuid.uuid4()), + ) + wandb.run.save() + + +@pyrallis.wrap() +def train(config: TrainConfig): + env = gym.make(config.env_name) + eval_env = gym.make(config.env_name) + + is_env_with_goal = config.env_name.startswith(ENVS_WITH_GOAL) + + max_steps = env._max_episode_steps + + set_seed(config.seed, env, deterministic_torch=config.deterministic_torch) + set_env_seed(eval_env, config.eval_seed) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + dataset = d4rl.qlearning_dataset(env) + + reward_mod_dict = {} + if config.normalize_reward: + reward_mod_dict = modify_reward(dataset, config.env_name) + + state_mean, state_std = compute_mean_std(dataset["observations"], eps=1e-3) + dataset["observations"] = normalize_states( + dataset["observations"], state_mean, state_std + ) + dataset["next_observations"] = normalize_states( + dataset["next_observations"], state_mean, state_std + ) + env = wrap_env(env, state_mean=state_mean, state_std=state_std) + eval_env = wrap_env(eval_env, state_mean=state_mean, state_std=state_std) + replay_buffer = ReplayBuffer( + state_dim, + action_dim, + config.buffer_size, + config.device, + ) + replay_buffer.load_d4rl_dataset(dataset) + + actor_critic_kwargs = { + "state_dim": state_dim, + "action_dim": action_dim, + "hidden_dim": config.hidden_dim, + } + + actor = Actor(**actor_critic_kwargs) + actor.to(config.device) + actor_optimizer = torch.optim.Adam(actor.parameters(), lr=config.learning_rate) + critic_1 = Critic(**actor_critic_kwargs) + critic_2 = Critic(**actor_critic_kwargs) + critic_1.to(config.device) + critic_2.to(config.device) + critic_1_optimizer = torch.optim.Adam(critic_1.parameters(), lr=config.learning_rate) + critic_2_optimizer = torch.optim.Adam(critic_2.parameters(), lr=config.learning_rate) + + awac = AdvantageWeightedActorCritic( + actor=actor, + actor_optimizer=actor_optimizer, + critic_1=critic_1, + critic_1_optimizer=critic_1_optimizer, + critic_2=critic_2, + critic_2_optimizer=critic_2_optimizer, + gamma=config.gamma, + tau=config.tau, + awac_lambda=config.awac_lambda, + ) + wandb_init(asdict(config)) + + if config.checkpoints_path is not None: + print(f"Checkpoints path: {config.checkpoints_path}") + os.makedirs(config.checkpoints_path, exist_ok=True) + with open(os.path.join(config.checkpoints_path, "config.yaml"), "w") as f: + pyrallis.dump(config, f) + + full_eval_scores, full_normalized_eval_scores = [], [] + state, done = env.reset(), False + episode_step = 0 + episode_return = 0 + goal_achieved = False + + eval_successes = [] + train_successes = [] + + print("Offline pretraining") + for t in trange( + int(config.offline_iterations) + int(config.online_iterations), ncols=80 + ): + if t == config.offline_iterations: + print("Online tuning") + online_log = {} + if t >= config.offline_iterations: + episode_step += 1 + action, _ = actor( + torch.tensor( + state.reshape(1, -1), device=config.device, dtype=torch.float32 + ) + ) + action = action.cpu().data.numpy().flatten() + next_state, reward, done, 
env_infos = env.step(action) + + if not goal_achieved: + goal_achieved = is_goal_reached(reward, env_infos) + episode_return += reward + real_done = False # Episode can timeout which is different from done + if done and episode_step < max_steps: + real_done = True + + if config.normalize_reward: + reward = modify_reward_online(reward, config.env_name, **reward_mod_dict) + + replay_buffer.add_transition(state, action, reward, next_state, real_done) + state = next_state + if done: + state, done = env.reset(), False + # Valid only for envs with goal, e.g. AntMaze, Adroit + if is_env_with_goal: + train_successes.append(goal_achieved) + online_log["train/regret"] = np.mean(1 - np.array(train_successes)) + online_log["train/is_success"] = float(goal_achieved) + online_log["train/episode_return"] = episode_return + normalized_return = eval_env.get_normalized_score(episode_return) + online_log["train/d4rl_normalized_episode_return"] = ( + normalized_return * 100.0 + ) + online_log["train/episode_length"] = episode_step + episode_return = 0 + episode_step = 0 + goal_achieved = False + + batch = replay_buffer.sample(config.batch_size) + batch = [b.to(config.device) for b in batch] + update_result = awac.update(batch) + update_result[ + "offline_iter" if t < config.offline_iterations else "online_iter" + ] = (t if t < config.offline_iterations else t - config.offline_iterations) + update_result.update(online_log) + wandb.log(update_result, step=t) + if (t + 1) % config.eval_frequency == 0: + eval_scores, success_rate = eval_actor( + eval_env, actor, config.device, config.n_test_episodes, config.test_seed + ) + eval_log = {} + + full_eval_scores.append(eval_scores) + wandb.log({"eval/eval_score": eval_scores.mean()}, step=t) + if hasattr(eval_env, "get_normalized_score"): + normalized = eval_env.get_normalized_score(np.mean(eval_scores)) + # Valid only for envs with goal, e.g. 
AntMaze, Adroit + if t >= config.offline_iterations and is_env_with_goal: + eval_successes.append(success_rate) + eval_log["eval/regret"] = np.mean(1 - np.array(train_successes)) + eval_log["eval/success_rate"] = success_rate + normalized_eval_scores = normalized * 100.0 + full_normalized_eval_scores.append(normalized_eval_scores) + eval_log["eval/d4rl_normalized_score"] = normalized_eval_scores + wandb.log(eval_log, step=t) + if config.checkpoints_path: + torch.save( + awac.state_dict(), + os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"), + ) + wandb.finish() + + +if __name__ == "__main__": + train() diff --git a/algorithms/finetune/cal_ql.py b/algorithms/finetune/cal_ql.py new file mode 100644 index 00000000..ea2e3df7 --- /dev/null +++ b/algorithms/finetune/cal_ql.py @@ -0,0 +1,1234 @@ +# source: https://github.com/nakamotoo/Cal-QL/tree/main +# https://arxiv.org/pdf/2303.05479.pdf +import os +import random +import uuid +from copy import deepcopy +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +import d4rl +import gym +import numpy as np +import pyrallis +import torch +import torch.nn as nn +import torch.nn.functional as F +import wandb +from torch.distributions import Normal, TanhTransform, TransformedDistribution + +TensorBatch = List[torch.Tensor] + +ENVS_WITH_GOAL = ("antmaze", "pen", "door", "hammer", "relocate") + + +@dataclass +class TrainConfig: + # Experiment + device: str = "cuda" + env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name + seed: int = 0 # Sets Gym, PyTorch and Numpy seeds + eval_seed: int = 0 # Eval environment seed + eval_freq: int = int(5e3) # How often (time steps) we evaluate + n_episodes: int = 10 # How many episodes run during evaluation + offline_iterations: int = int(1e6) # Number of offline updates + online_iterations: int = int(1e6) # Number of online updates + checkpoints_path: Optional[str] = None # Save path + load_model: str = "" # Model load file name, "" doesn't load + # CQL + buffer_size: int = 2_000_000 # Replay buffer size + batch_size: int = 256 # Batch size for all networks + discount: float = 0.99 # Discount factor + alpha_multiplier: float = 1.0 # Multiplier for alpha in loss + use_automatic_entropy_tuning: bool = True # Tune entropy + backup_entropy: bool = False # Use backup entropy + policy_lr: float = 3e-5 # Policy learning rate + qf_lr: float = 3e-4 # Critics learning rate + soft_target_update_rate: float = 5e-3 # Target network update rate + bc_steps: int = int(0) # Number of BC steps at start + target_update_period: int = 1 # Frequency of target nets updates + cql_alpha: float = 10.0 # CQL offline regularization parameter + cql_alpha_online: float = 10.0 # CQL online regularization parameter + cql_n_actions: int = 10 # Number of sampled actions + cql_importance_sample: bool = True # Use importance sampling + cql_lagrange: bool = False # Use Lagrange version of CQL + cql_target_action_gap: float = -1.0 # Action gap + cql_temp: float = 1.0 # CQL temperature + cql_max_target_backup: bool = False # Use max target backup + cql_clip_diff_min: float = -np.inf # Q-function lower loss clipping + cql_clip_diff_max: float = np.inf # Q-function upper loss clipping + orthogonal_init: bool = True # Orthogonal initialization + normalize: bool = True # Normalize states + normalize_reward: bool = False # Normalize reward + q_n_hidden_layers: int = 2 # Number of hidden layers in Q networks + reward_scale: float = 1.0 # Reward scale for 
normalization + reward_bias: float = 0.0 # Reward bias for normalization + # Cal-QL + mixing_ratio: float = 0.5 # Data mixing ratio for online tuning + is_sparse_reward: bool = False # Use sparse reward + # Wandb logging + project: str = "CORL" + group: str = "Cal-QL-D4RL" + name: str = "Cal-QL" + + def __post_init__(self): + self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}" + if self.checkpoints_path is not None: + self.checkpoints_path = os.path.join(self.checkpoints_path, self.name) + + +def soft_update(target: nn.Module, source: nn.Module, tau: float): + for target_param, source_param in zip(target.parameters(), source.parameters()): + target_param.data.copy_((1 - tau) * target_param.data + tau * source_param.data) + + +def compute_mean_std(states: np.ndarray, eps: float) -> Tuple[np.ndarray, np.ndarray]: + mean = states.mean(0) + std = states.std(0) + eps + return mean, std + + +def normalize_states(states: np.ndarray, mean: np.ndarray, std: np.ndarray): + return (states - mean) / std + + +def wrap_env( + env: gym.Env, + state_mean: Union[np.ndarray, float] = 0.0, + state_std: Union[np.ndarray, float] = 1.0, + reward_scale: float = 1.0, +) -> gym.Env: + # PEP 8: E731 do not assign a lambda expression, use a def + def normalize_state(state): + return ( + state - state_mean + ) / state_std # epsilon should be already added in std. + + def scale_reward(reward): + # Please be careful, here reward is multiplied by scale! + return reward_scale * reward + + env = gym.wrappers.TransformObservation(env, normalize_state) + if reward_scale != 1.0: + env = gym.wrappers.TransformReward(env, scale_reward) + return env + + +class ReplayBuffer: + def __init__( + self, + state_dim: int, + action_dim: int, + buffer_size: int, + device: str = "cpu", + ): + self._buffer_size = buffer_size + self._pointer = 0 + self._size = 0 + + self._states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._actions = torch.zeros( + (buffer_size, action_dim), dtype=torch.float32, device=device + ) + self._rewards = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._next_states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._dones = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._mc_returns = torch.zeros( + (buffer_size, 1), dtype=torch.float32, device=device + ) + + self._device = device + + def _to_tensor(self, data: np.ndarray) -> torch.Tensor: + return torch.tensor(data, dtype=torch.float32, device=self._device) + + # Loads data in d4rl format, i.e. from Dict[str, np.array]. + def load_d4rl_dataset(self, data: Dict[str, np.ndarray]): + if self._size != 0: + raise ValueError("Trying to load data into non-empty replay buffer") + n_transitions = data["observations"].shape[0] + if n_transitions > self._buffer_size: + raise ValueError( + "Replay buffer is smaller than the dataset you are trying to load!" 
+ ) + self._states[:n_transitions] = self._to_tensor(data["observations"]) + self._actions[:n_transitions] = self._to_tensor(data["actions"]) + self._rewards[:n_transitions] = self._to_tensor(data["rewards"][..., None]) + self._next_states[:n_transitions] = self._to_tensor(data["next_observations"]) + self._dones[:n_transitions] = self._to_tensor(data["terminals"][..., None]) + self._mc_returns[:n_transitions] = self._to_tensor(data["mc_returns"][..., None]) + self._size += n_transitions + self._pointer = min(self._size, n_transitions) + + print(f"Dataset size: {n_transitions}") + + def sample(self, batch_size: int) -> TensorBatch: + indices = np.random.randint(0, self._size, size=batch_size) + states = self._states[indices] + actions = self._actions[indices] + rewards = self._rewards[indices] + next_states = self._next_states[indices] + dones = self._dones[indices] + mc_returns = self._mc_returns[indices] + return [states, actions, rewards, next_states, dones, mc_returns] + + def add_transition( + self, + state: np.ndarray, + action: np.ndarray, + reward: float, + next_state: np.ndarray, + done: bool, + ): + # Use this method to add new data into the replay buffer during fine-tuning. + self._states[self._pointer] = self._to_tensor(state) + self._actions[self._pointer] = self._to_tensor(action) + self._rewards[self._pointer] = self._to_tensor(reward) + self._next_states[self._pointer] = self._to_tensor(next_state) + self._dones[self._pointer] = self._to_tensor(done) + self._mc_returns[self._pointer] = 0.0 + + self._pointer = (self._pointer + 1) % self._buffer_size + self._size = min(self._size + 1, self._buffer_size) + + +def set_env_seed(env: Optional[gym.Env], seed: int): + env.seed(seed) + env.action_space.seed(seed) + + +def set_seed( + seed: int, env: Optional[gym.Env] = None, deterministic_torch: bool = False +): + if env is not None: + set_env_seed(env, seed) + os.environ["PYTHONHASHSEED"] = str(seed) + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(deterministic_torch) + + +def wandb_init(config: dict) -> None: + wandb.init( + config=config, + project=config["project"], + group=config["group"], + name=config["name"], + id=str(uuid.uuid4()), + ) + wandb.run.save() + + +def is_goal_reached(reward: float, info: Dict) -> bool: + if "goal_achieved" in info: + return info["goal_achieved"] + return reward > 0 # Assuming that reaching target is a positive reward + + +@torch.no_grad() +def eval_actor( + env: gym.Env, actor: nn.Module, device: str, n_episodes: int, seed: int +) -> Tuple[np.ndarray, np.ndarray]: + env.seed(seed) + actor.eval() + episode_rewards = [] + successes = [] + for _ in range(n_episodes): + state, done = env.reset(), False + episode_reward = 0.0 + goal_achieved = False + while not done: + action = actor.act(state, device) + state, reward, done, env_infos = env.step(action) + episode_reward += reward + if not goal_achieved: + goal_achieved = is_goal_reached(reward, env_infos) + # Valid only for environments with goal + successes.append(float(goal_achieved)) + episode_rewards.append(episode_reward) + + actor.train() + return np.asarray(episode_rewards), np.mean(successes) + + +def return_reward_range(dataset: Dict, max_episode_steps: int) -> Tuple[float, float]: + returns, lengths = [], [] + ep_ret, ep_len = 0.0, 0 + for r, d in zip(dataset["rewards"], dataset["terminals"]): + ep_ret += float(r) + ep_len += 1 + if d or ep_len == max_episode_steps: + returns.append(ep_ret) + lengths.append(ep_len) + ep_ret, ep_len = 
0.0, 0 + lengths.append(ep_len) # but still keep track of number of steps + assert sum(lengths) == len(dataset["rewards"]) + return min(returns), max(returns) + + +def get_return_to_go(dataset: Dict, env: gym.Env, config: TrainConfig) -> np.ndarray: + returns = [] + ep_ret, ep_len = 0.0, 0 + cur_rewards = [] + terminals = [] + N = len(dataset["rewards"]) + for t, (r, d) in enumerate(zip(dataset["rewards"], dataset["terminals"])): + ep_ret += float(r) + cur_rewards.append(float(r)) + terminals.append(float(d)) + ep_len += 1 + is_last_step = ( + (t == N - 1) + or ( + np.linalg.norm( + dataset["observations"][t + 1] - dataset["next_observations"][t] + ) + > 1e-6 + ) + or ep_len == env._max_episode_steps + ) + + if d or is_last_step: + discounted_returns = [0] * ep_len + prev_return = 0 + if ( + config.is_sparse_reward + and r + == env.ref_min_score * config.reward_scale + config.reward_bias + ): + discounted_returns = [r / (1 - config.discount)] * ep_len + else: + for i in reversed(range(ep_len)): + discounted_returns[i] = cur_rewards[ + i + ] + config.discount * prev_return * (1 - terminals[i]) + prev_return = discounted_returns[i] + returns += discounted_returns + ep_ret, ep_len = 0.0, 0 + cur_rewards = [] + terminals = [] + return returns + + +def modify_reward( + dataset: Dict, + env_name: str, + max_episode_steps: int = 1000, + reward_scale: float = 1.0, + reward_bias: float = 0.0, +) -> Dict: + modification_data = {} + if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): + min_ret, max_ret = return_reward_range(dataset, max_episode_steps) + dataset["rewards"] /= max_ret - min_ret + dataset["rewards"] *= max_episode_steps + modification_data = { + "max_ret": max_ret, + "min_ret": min_ret, + "max_episode_steps": max_episode_steps, + } + dataset["rewards"] = dataset["rewards"] * reward_scale + reward_bias + return modification_data + + +def modify_reward_online( + reward: float, + env_name: str, + reward_scale: float = 1.0, + reward_bias: float = 0.0, + **kwargs, +) -> float: + if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): + reward /= kwargs["max_ret"] - kwargs["min_ret"] + reward *= kwargs["max_episode_steps"] + reward = reward * reward_scale + reward_bias + return reward + + +def extend_and_repeat(tensor: torch.Tensor, dim: int, repeat: int) -> torch.Tensor: + return tensor.unsqueeze(dim).repeat_interleave(repeat, dim=dim) + + +def init_module_weights(module: torch.nn.Module, orthogonal_init: bool = False): + if isinstance(module, nn.Linear): + if orthogonal_init: + nn.init.orthogonal_(module.weight, gain=np.sqrt(2)) + nn.init.constant_(module.bias, 0.0) + else: + nn.init.xavier_uniform_(module.weight, gain=1e-2) + + +class ReparameterizedTanhGaussian(nn.Module): + def __init__( + self, + log_std_min: float = -20.0, + log_std_max: float = 2.0, + no_tanh: bool = False, + ): + super().__init__() + self.log_std_min = log_std_min + self.log_std_max = log_std_max + self.no_tanh = no_tanh + + def log_prob( + self, mean: torch.Tensor, log_std: torch.Tensor, sample: torch.Tensor + ) -> torch.Tensor: + log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) + std = torch.exp(log_std) + if self.no_tanh: + action_distribution = Normal(mean, std) + else: + action_distribution = TransformedDistribution( + Normal(mean, std), TanhTransform(cache_size=1) + ) + return torch.sum(action_distribution.log_prob(sample), dim=-1) + + def forward( + self, + mean: torch.Tensor, + log_std: torch.Tensor, + deterministic: bool = False, + ) -> Tuple[torch.Tensor, 
torch.Tensor]: + log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) + std = torch.exp(log_std) + + if self.no_tanh: + action_distribution = Normal(mean, std) + else: + action_distribution = TransformedDistribution( + Normal(mean, std), TanhTransform(cache_size=1) + ) + + if deterministic: + action_sample = torch.tanh(mean) + else: + action_sample = action_distribution.rsample() + + log_prob = torch.sum(action_distribution.log_prob(action_sample), dim=-1) + + return action_sample, log_prob + + +class TanhGaussianPolicy(nn.Module): + def __init__( + self, + state_dim: int, + action_dim: int, + max_action: float, + log_std_multiplier: float = 1.0, + log_std_offset: float = -1.0, + orthogonal_init: bool = False, + no_tanh: bool = False, + ): + super().__init__() + self.observation_dim = state_dim + self.action_dim = action_dim + self.max_action = max_action + self.orthogonal_init = orthogonal_init + self.no_tanh = no_tanh + + self.base_network = nn.Sequential( + nn.Linear(state_dim, 256), + nn.ReLU(), + nn.Linear(256, 256), + nn.ReLU(), + nn.Linear(256, 256), + nn.ReLU(), + nn.Linear(256, 2 * action_dim), + ) + + if orthogonal_init: + self.base_network.apply(lambda m: init_module_weights(m, True)) + else: + init_module_weights(self.base_network[-1], False) + + self.log_std_multiplier = Scalar(log_std_multiplier) + self.log_std_offset = Scalar(log_std_offset) + self.tanh_gaussian = ReparameterizedTanhGaussian(no_tanh=no_tanh) + + def log_prob( + self, observations: torch.Tensor, actions: torch.Tensor + ) -> torch.Tensor: + if actions.ndim == 3: + observations = extend_and_repeat(observations, 1, actions.shape[1]) + base_network_output = self.base_network(observations) + mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) + log_std = self.log_std_multiplier() * log_std + self.log_std_offset() + _, log_probs = self.tanh_gaussian(mean, log_std, False) + return log_probs + + def forward( + self, + observations: torch.Tensor, + deterministic: bool = False, + repeat: bool = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + if repeat is not None: + observations = extend_and_repeat(observations, 1, repeat) + base_network_output = self.base_network(observations) + mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) + log_std = self.log_std_multiplier() * log_std + self.log_std_offset() + actions, log_probs = self.tanh_gaussian(mean, log_std, deterministic) + return self.max_action * actions, log_probs + + @torch.no_grad() + def act(self, state: np.ndarray, device: str = "cpu"): + state = torch.tensor(state.reshape(1, -1), device=device, dtype=torch.float32) + with torch.no_grad(): + actions, _ = self(state, not self.training) + return actions.cpu().data.numpy().flatten() + + +class FullyConnectedQFunction(nn.Module): + def __init__( + self, + observation_dim: int, + action_dim: int, + orthogonal_init: bool = False, + n_hidden_layers: int = 2, + ): + super().__init__() + self.observation_dim = observation_dim + self.action_dim = action_dim + self.orthogonal_init = orthogonal_init + + layers = [ + nn.Linear(observation_dim + action_dim, 256), + nn.ReLU(), + ] + for _ in range(n_hidden_layers - 1): + layers.append(nn.Linear(256, 256)) + layers.append(nn.ReLU()) + layers.append(nn.Linear(256, 1)) + + self.network = nn.Sequential(*layers) + if orthogonal_init: + self.network.apply(lambda m: init_module_weights(m, True)) + else: + init_module_weights(self.network[-1], False) + + def forward(self, observations: torch.Tensor, actions: torch.Tensor) -> 
torch.Tensor: + multiple_actions = False + batch_size = observations.shape[0] + if actions.ndim == 3 and observations.ndim == 2: + multiple_actions = True + observations = extend_and_repeat(observations, 1, actions.shape[1]).reshape( + -1, observations.shape[-1] + ) + actions = actions.reshape(-1, actions.shape[-1]) + input_tensor = torch.cat([observations, actions], dim=-1) + q_values = torch.squeeze(self.network(input_tensor), dim=-1) + if multiple_actions: + q_values = q_values.reshape(batch_size, -1) + return q_values + + +class Scalar(nn.Module): + def __init__(self, init_value: float): + super().__init__() + self.constant = nn.Parameter(torch.tensor(init_value, dtype=torch.float32)) + + def forward(self) -> nn.Parameter: + return self.constant + + +class CalQL: + def __init__( + self, + critic_1, + critic_1_optimizer, + critic_2, + critic_2_optimizer, + actor, + actor_optimizer, + target_entropy: float, + discount: float = 0.99, + alpha_multiplier: float = 1.0, + use_automatic_entropy_tuning: bool = True, + backup_entropy: bool = False, + policy_lr: bool = 3e-4, + qf_lr: bool = 3e-4, + soft_target_update_rate: float = 5e-3, + bc_steps=100000, + target_update_period: int = 1, + cql_n_actions: int = 10, + cql_importance_sample: bool = True, + cql_lagrange: bool = False, + cql_target_action_gap: float = -1.0, + cql_temp: float = 1.0, + cql_alpha: float = 5.0, + cql_max_target_backup: bool = False, + cql_clip_diff_min: float = -np.inf, + cql_clip_diff_max: float = np.inf, + device: str = "cpu", + ): + super().__init__() + + self.discount = discount + self.target_entropy = target_entropy + self.alpha_multiplier = alpha_multiplier + self.use_automatic_entropy_tuning = use_automatic_entropy_tuning + self.backup_entropy = backup_entropy + self.policy_lr = policy_lr + self.qf_lr = qf_lr + self.soft_target_update_rate = soft_target_update_rate + self.bc_steps = bc_steps + self.target_update_period = target_update_period + self.cql_n_actions = cql_n_actions + self.cql_importance_sample = cql_importance_sample + self.cql_lagrange = cql_lagrange + self.cql_target_action_gap = cql_target_action_gap + self.cql_temp = cql_temp + self.cql_alpha = cql_alpha + self.cql_max_target_backup = cql_max_target_backup + self.cql_clip_diff_min = cql_clip_diff_min + self.cql_clip_diff_max = cql_clip_diff_max + self._device = device + + self.total_it = 0 + + self.critic_1 = critic_1 + self.critic_2 = critic_2 + + self.target_critic_1 = deepcopy(self.critic_1).to(device) + self.target_critic_2 = deepcopy(self.critic_2).to(device) + + self.actor = actor + + self.actor_optimizer = actor_optimizer + self.critic_1_optimizer = critic_1_optimizer + self.critic_2_optimizer = critic_2_optimizer + + if self.use_automatic_entropy_tuning: + self.log_alpha = Scalar(0.0) + self.alpha_optimizer = torch.optim.Adam( + self.log_alpha.parameters(), + lr=self.policy_lr, + ) + else: + self.log_alpha = None + + self.log_alpha_prime = Scalar(1.0) + self.alpha_prime_optimizer = torch.optim.Adam( + self.log_alpha_prime.parameters(), + lr=self.qf_lr, + ) + + self._calibration_enabled = True + self.total_it = 0 + + def update_target_network(self, soft_target_update_rate: float): + soft_update(self.target_critic_1, self.critic_1, soft_target_update_rate) + soft_update(self.target_critic_2, self.critic_2, soft_target_update_rate) + + def switch_calibration(self): + self._calibration_enabled = not self._calibration_enabled + + def _alpha_and_alpha_loss(self, observations: torch.Tensor, log_pi: torch.Tensor): + if 
self.use_automatic_entropy_tuning: + alpha_loss = -( + self.log_alpha() * (log_pi + self.target_entropy).detach() + ).mean() + alpha = self.log_alpha().exp() * self.alpha_multiplier + else: + alpha_loss = observations.new_tensor(0.0) + alpha = observations.new_tensor(self.alpha_multiplier) + return alpha, alpha_loss + + def _policy_loss( + self, + observations: torch.Tensor, + actions: torch.Tensor, + new_actions: torch.Tensor, + alpha: torch.Tensor, + log_pi: torch.Tensor, + ) -> torch.Tensor: + if self.total_it <= self.bc_steps: + log_probs = self.actor.log_prob(observations, actions) + policy_loss = (alpha * log_pi - log_probs).mean() + else: + q_new_actions = torch.min( + self.critic_1(observations, new_actions), + self.critic_2(observations, new_actions), + ) + policy_loss = (alpha * log_pi - q_new_actions).mean() + return policy_loss + + def _q_loss( + self, + observations: torch.Tensor, + actions: torch.Tensor, + next_observations: torch.Tensor, + rewards: torch.Tensor, + dones: torch.Tensor, + mc_returns: torch.Tensor, + alpha: torch.Tensor, + log_dict: Dict, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + q1_predicted = self.critic_1(observations, actions) + q2_predicted = self.critic_2(observations, actions) + + if self.cql_max_target_backup: + new_next_actions, next_log_pi = self.actor( + next_observations, repeat=self.cql_n_actions + ) + target_q_values, max_target_indices = torch.max( + torch.min( + self.target_critic_1(next_observations, new_next_actions), + self.target_critic_2(next_observations, new_next_actions), + ), + dim=-1, + ) + next_log_pi = torch.gather( + next_log_pi, -1, max_target_indices.unsqueeze(-1) + ).squeeze(-1) + else: + new_next_actions, next_log_pi = self.actor(next_observations) + target_q_values = torch.min( + self.target_critic_1(next_observations, new_next_actions), + self.target_critic_2(next_observations, new_next_actions), + ) + + if self.backup_entropy: + target_q_values = target_q_values - alpha * next_log_pi + + target_q_values = target_q_values.unsqueeze(-1) + td_target = rewards + (1.0 - dones) * self.discount * target_q_values.detach() + td_target = td_target.squeeze(-1) + qf1_loss = F.mse_loss(q1_predicted, td_target.detach()) + qf2_loss = F.mse_loss(q2_predicted, td_target.detach()) + + # CQL + batch_size = actions.shape[0] + action_dim = actions.shape[-1] + cql_random_actions = actions.new_empty( + (batch_size, self.cql_n_actions, action_dim), requires_grad=False + ).uniform_(-1, 1) + cql_current_actions, cql_current_log_pis = self.actor( + observations, repeat=self.cql_n_actions + ) + cql_next_actions, cql_next_log_pis = self.actor( + next_observations, repeat=self.cql_n_actions + ) + cql_current_actions, cql_current_log_pis = ( + cql_current_actions.detach(), + cql_current_log_pis.detach(), + ) + cql_next_actions, cql_next_log_pis = ( + cql_next_actions.detach(), + cql_next_log_pis.detach(), + ) + + cql_q1_rand = self.critic_1(observations, cql_random_actions) + cql_q2_rand = self.critic_2(observations, cql_random_actions) + cql_q1_current_actions = self.critic_1(observations, cql_current_actions) + cql_q2_current_actions = self.critic_2(observations, cql_current_actions) + cql_q1_next_actions = self.critic_1(observations, cql_next_actions) + cql_q2_next_actions = self.critic_2(observations, cql_next_actions) + + # Calibration + lower_bounds = mc_returns.reshape(-1, 1).repeat( + 1, cql_q1_current_actions.shape[1] + ) + + num_vals = torch.sum(lower_bounds == lower_bounds) + bound_rate_cql_q1_current_actions = ( + 
torch.sum(cql_q1_current_actions < lower_bounds) / num_vals + ) + bound_rate_cql_q2_current_actions = ( + torch.sum(cql_q2_current_actions < lower_bounds) / num_vals + ) + bound_rate_cql_q1_next_actions = ( + torch.sum(cql_q1_next_actions < lower_bounds) / num_vals + ) + bound_rate_cql_q2_next_actions = ( + torch.sum(cql_q2_next_actions < lower_bounds) / num_vals + ) + + """ Cal-QL: bound Q-values with MC return-to-go """ + if self._calibration_enabled: + cql_q1_current_actions = torch.maximum(cql_q1_current_actions, lower_bounds) + cql_q2_current_actions = torch.maximum(cql_q2_current_actions, lower_bounds) + cql_q1_next_actions = torch.maximum(cql_q1_next_actions, lower_bounds) + cql_q2_next_actions = torch.maximum(cql_q2_next_actions, lower_bounds) + + cql_cat_q1 = torch.cat( + [ + cql_q1_rand, + torch.unsqueeze(q1_predicted, 1), + cql_q1_next_actions, + cql_q1_current_actions, + ], + dim=1, + ) + cql_cat_q2 = torch.cat( + [ + cql_q2_rand, + torch.unsqueeze(q2_predicted, 1), + cql_q2_next_actions, + cql_q2_current_actions, + ], + dim=1, + ) + cql_std_q1 = torch.std(cql_cat_q1, dim=1) + cql_std_q2 = torch.std(cql_cat_q2, dim=1) + + if self.cql_importance_sample: + random_density = np.log(0.5**action_dim) + cql_cat_q1 = torch.cat( + [ + cql_q1_rand - random_density, + cql_q1_next_actions - cql_next_log_pis.detach(), + cql_q1_current_actions - cql_current_log_pis.detach(), + ], + dim=1, + ) + cql_cat_q2 = torch.cat( + [ + cql_q2_rand - random_density, + cql_q2_next_actions - cql_next_log_pis.detach(), + cql_q2_current_actions - cql_current_log_pis.detach(), + ], + dim=1, + ) + + cql_qf1_ood = torch.logsumexp(cql_cat_q1 / self.cql_temp, dim=1) * self.cql_temp + cql_qf2_ood = torch.logsumexp(cql_cat_q2 / self.cql_temp, dim=1) * self.cql_temp + + """Subtract the log likelihood of data""" + cql_qf1_diff = torch.clamp( + cql_qf1_ood - q1_predicted, + self.cql_clip_diff_min, + self.cql_clip_diff_max, + ).mean() + cql_qf2_diff = torch.clamp( + cql_qf2_ood - q2_predicted, + self.cql_clip_diff_min, + self.cql_clip_diff_max, + ).mean() + + if self.cql_lagrange: + alpha_prime = torch.clamp( + torch.exp(self.log_alpha_prime()), min=0.0, max=1000000.0 + ) + cql_min_qf1_loss = ( + alpha_prime + * self.cql_alpha + * (cql_qf1_diff - self.cql_target_action_gap) + ) + cql_min_qf2_loss = ( + alpha_prime + * self.cql_alpha + * (cql_qf2_diff - self.cql_target_action_gap) + ) + + self.alpha_prime_optimizer.zero_grad() + alpha_prime_loss = (-cql_min_qf1_loss - cql_min_qf2_loss) * 0.5 + alpha_prime_loss.backward(retain_graph=True) + self.alpha_prime_optimizer.step() + else: + cql_min_qf1_loss = cql_qf1_diff * self.cql_alpha + cql_min_qf2_loss = cql_qf2_diff * self.cql_alpha + alpha_prime_loss = observations.new_tensor(0.0) + alpha_prime = observations.new_tensor(0.0) + + qf_loss = qf1_loss + qf2_loss + cql_min_qf1_loss + cql_min_qf2_loss + + log_dict.update( + dict( + qf1_loss=qf1_loss.item(), + qf2_loss=qf2_loss.item(), + alpha=alpha.item(), + average_qf1=q1_predicted.mean().item(), + average_qf2=q2_predicted.mean().item(), + average_target_q=target_q_values.mean().item(), + ) + ) + + log_dict.update( + dict( + cql_std_q1=cql_std_q1.mean().item(), + cql_std_q2=cql_std_q2.mean().item(), + cql_q1_rand=cql_q1_rand.mean().item(), + cql_q2_rand=cql_q2_rand.mean().item(), + cql_min_qf1_loss=cql_min_qf1_loss.mean().item(), + cql_min_qf2_loss=cql_min_qf2_loss.mean().item(), + cql_qf1_diff=cql_qf1_diff.mean().item(), + cql_qf2_diff=cql_qf2_diff.mean().item(), + cql_q1_current_actions=cql_q1_current_actions.mean().item(), 
+ cql_q2_current_actions=cql_q2_current_actions.mean().item(), + cql_q1_next_actions=cql_q1_next_actions.mean().item(), + cql_q2_next_actions=cql_q2_next_actions.mean().item(), + alpha_prime_loss=alpha_prime_loss.item(), + alpha_prime=alpha_prime.item(), + bound_rate_cql_q1_current_actions=bound_rate_cql_q1_current_actions.item(), # noqa + bound_rate_cql_q2_current_actions=bound_rate_cql_q2_current_actions.item(), # noqa + bound_rate_cql_q1_next_actions=bound_rate_cql_q1_next_actions.item(), + bound_rate_cql_q2_next_actions=bound_rate_cql_q2_next_actions.item(), + ) + ) + + return qf_loss, alpha_prime, alpha_prime_loss + + def train(self, batch: TensorBatch) -> Dict[str, float]: + ( + observations, + actions, + rewards, + next_observations, + dones, + mc_returns, + ) = batch + self.total_it += 1 + + new_actions, log_pi = self.actor(observations) + + alpha, alpha_loss = self._alpha_and_alpha_loss(observations, log_pi) + + """ Policy loss """ + policy_loss = self._policy_loss( + observations, actions, new_actions, alpha, log_pi + ) + + log_dict = dict( + log_pi=log_pi.mean().item(), + policy_loss=policy_loss.item(), + alpha_loss=alpha_loss.item(), + alpha=alpha.item(), + ) + + """ Q function loss """ + qf_loss, alpha_prime, alpha_prime_loss = self._q_loss( + observations, + actions, + next_observations, + rewards, + dones, + mc_returns, + alpha, + log_dict, + ) + + if self.use_automatic_entropy_tuning: + self.alpha_optimizer.zero_grad() + alpha_loss.backward() + self.alpha_optimizer.step() + + self.actor_optimizer.zero_grad() + policy_loss.backward() + self.actor_optimizer.step() + + self.critic_1_optimizer.zero_grad() + self.critic_2_optimizer.zero_grad() + qf_loss.backward(retain_graph=True) + self.critic_1_optimizer.step() + self.critic_2_optimizer.step() + + if self.total_it % self.target_update_period == 0: + self.update_target_network(self.soft_target_update_rate) + + return log_dict + + def state_dict(self) -> Dict[str, Any]: + return { + "actor": self.actor.state_dict(), + "critic1": self.critic_1.state_dict(), + "critic2": self.critic_2.state_dict(), + "critic1_target": self.target_critic_1.state_dict(), + "critic2_target": self.target_critic_2.state_dict(), + "critic_1_optimizer": self.critic_1_optimizer.state_dict(), + "critic_2_optimizer": self.critic_2_optimizer.state_dict(), + "actor_optim": self.actor_optimizer.state_dict(), + "sac_log_alpha": self.log_alpha, + "sac_log_alpha_optim": self.alpha_optimizer.state_dict(), + "cql_log_alpha": self.log_alpha_prime, + "cql_log_alpha_optim": self.alpha_prime_optimizer.state_dict(), + "total_it": self.total_it, + } + + def load_state_dict(self, state_dict: Dict[str, Any]): + self.actor.load_state_dict(state_dict=state_dict["actor"]) + self.critic_1.load_state_dict(state_dict=state_dict["critic1"]) + self.critic_2.load_state_dict(state_dict=state_dict["critic2"]) + + self.target_critic_1.load_state_dict(state_dict=state_dict["critic1_target"]) + self.target_critic_2.load_state_dict(state_dict=state_dict["critic2_target"]) + + self.critic_1_optimizer.load_state_dict( + state_dict=state_dict["critic_1_optimizer"] + ) + self.critic_2_optimizer.load_state_dict( + state_dict=state_dict["critic_2_optimizer"] + ) + self.actor_optimizer.load_state_dict(state_dict=state_dict["actor_optim"]) + + self.log_alpha = state_dict["sac_log_alpha"] + self.alpha_optimizer.load_state_dict( + state_dict=state_dict["sac_log_alpha_optim"] + ) + + self.log_alpha_prime = state_dict["cql_log_alpha"] + self.alpha_prime_optimizer.load_state_dict( + 
state_dict=state_dict["cql_log_alpha_optim"] + ) + self.total_it = state_dict["total_it"] + + +@pyrallis.wrap() +def train(config: TrainConfig): + env = gym.make(config.env) + eval_env = gym.make(config.env) + + is_env_with_goal = config.env.startswith(ENVS_WITH_GOAL) + batch_size_offline = int(config.batch_size * config.mixing_ratio) + batch_size_online = config.batch_size - batch_size_offline + + max_steps = env._max_episode_steps + + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + + dataset = d4rl.qlearning_dataset(env) + + reward_mod_dict = {} + if config.normalize_reward: + reward_mod_dict = modify_reward( + dataset, + config.env, + reward_scale=config.reward_scale, + reward_bias=config.reward_bias, + ) + mc_returns = get_return_to_go(dataset, env, config) + dataset["mc_returns"] = np.array(mc_returns) + assert len(dataset["mc_returns"]) == len(dataset["rewards"]) + + if config.normalize: + state_mean, state_std = compute_mean_std(dataset["observations"], eps=1e-3) + else: + state_mean, state_std = 0, 1 + + dataset["observations"] = normalize_states( + dataset["observations"], state_mean, state_std + ) + dataset["next_observations"] = normalize_states( + dataset["next_observations"], state_mean, state_std + ) + env = wrap_env(env, state_mean=state_mean, state_std=state_std) + eval_env = wrap_env(eval_env, state_mean=state_mean, state_std=state_std) + offline_buffer = ReplayBuffer( + state_dim, + action_dim, + config.buffer_size, + config.device, + ) + online_buffer = ReplayBuffer( + state_dim, + action_dim, + config.buffer_size, + config.device, + ) + offline_buffer.load_d4rl_dataset(dataset) + + max_action = float(env.action_space.high[0]) + + if config.checkpoints_path is not None: + print(f"Checkpoints path: {config.checkpoints_path}") + os.makedirs(config.checkpoints_path, exist_ok=True) + with open(os.path.join(config.checkpoints_path, "config.yaml"), "w") as f: + pyrallis.dump(config, f) + + # Set seeds + seed = config.seed + set_seed(seed, env) + set_env_seed(eval_env, config.eval_seed) + + critic_1 = FullyConnectedQFunction( + state_dim, + action_dim, + config.orthogonal_init, + config.q_n_hidden_layers, + ).to(config.device) + critic_2 = FullyConnectedQFunction( + state_dim, + action_dim, + config.orthogonal_init, + config.q_n_hidden_layers, + ).to(config.device) + critic_1_optimizer = torch.optim.Adam(list(critic_1.parameters()), config.qf_lr) + critic_2_optimizer = torch.optim.Adam(list(critic_2.parameters()), config.qf_lr) + + actor = TanhGaussianPolicy( + state_dim, + action_dim, + max_action, + orthogonal_init=config.orthogonal_init, + ).to(config.device) + actor_optimizer = torch.optim.Adam(actor.parameters(), config.policy_lr) + + kwargs = { + "critic_1": critic_1, + "critic_2": critic_2, + "critic_1_optimizer": critic_1_optimizer, + "critic_2_optimizer": critic_2_optimizer, + "actor": actor, + "actor_optimizer": actor_optimizer, + "discount": config.discount, + "soft_target_update_rate": config.soft_target_update_rate, + "device": config.device, + # CQL + "target_entropy": -np.prod(env.action_space.shape).item(), + "alpha_multiplier": config.alpha_multiplier, + "use_automatic_entropy_tuning": config.use_automatic_entropy_tuning, + "backup_entropy": config.backup_entropy, + "policy_lr": config.policy_lr, + "qf_lr": config.qf_lr, + "bc_steps": config.bc_steps, + "target_update_period": config.target_update_period, + "cql_n_actions": config.cql_n_actions, + "cql_importance_sample": config.cql_importance_sample, + "cql_lagrange": 
config.cql_lagrange, + "cql_target_action_gap": config.cql_target_action_gap, + "cql_temp": config.cql_temp, + "cql_alpha": config.cql_alpha, + "cql_max_target_backup": config.cql_max_target_backup, + "cql_clip_diff_min": config.cql_clip_diff_min, + "cql_clip_diff_max": config.cql_clip_diff_max, + } + + print("---------------------------------------") + print(f"Training Cal-QL, Env: {config.env}, Seed: {seed}") + print("---------------------------------------") + + # Initialize actor + trainer = CalQL(**kwargs) + + if config.load_model != "": + policy_file = Path(config.load_model) + trainer.load_state_dict(torch.load(policy_file)) + actor = trainer.actor + + wandb_init(asdict(config)) + + evaluations = [] + state, done = env.reset(), False + episode_return = 0 + episode_step = 0 + goal_achieved = False + + eval_successes = [] + train_successes = [] + + print("Offline pretraining") + for t in range(int(config.offline_iterations) + int(config.online_iterations)): + if t == config.offline_iterations: + print("Online tuning") + trainer.switch_calibration() + trainer.cql_alpha = config.cql_alpha_online + online_log = {} + if t >= config.offline_iterations: + episode_step += 1 + action, _ = actor( + torch.tensor( + state.reshape(1, -1), + device=config.device, + dtype=torch.float32, + ) + ) + action = action.cpu().data.numpy().flatten() + next_state, reward, done, env_infos = env.step(action) + + if not goal_achieved: + goal_achieved = is_goal_reached(reward, env_infos) + episode_return += reward + real_done = False # Episode can timeout which is different from done + if done and episode_step < max_steps: + real_done = True + + if config.normalize_reward: + reward = modify_reward_online( + reward, + config.env, + reward_scale=config.reward_scale, + reward_bias=config.reward_bias, + **reward_mod_dict, + ) + online_buffer.add_transition(state, action, reward, next_state, real_done) + state = next_state + + if done: + state, done = env.reset(), False + # Valid only for envs with goal, e.g. 
AntMaze, Adroit + if is_env_with_goal: + train_successes.append(goal_achieved) + online_log["train/regret"] = np.mean(1 - np.array(train_successes)) + online_log["train/is_success"] = float(goal_achieved) + online_log["train/episode_return"] = episode_return + normalized_return = eval_env.get_normalized_score(episode_return) + online_log["train/d4rl_normalized_episode_return"] = ( + normalized_return * 100.0 + ) + online_log["train/episode_length"] = episode_step + episode_return = 0 + episode_step = 0 + goal_achieved = False + + if t < config.offline_iterations: + batch = offline_buffer.sample(config.batch_size) + batch = [b.to(config.device) for b in batch] + else: + offline_batch = offline_buffer.sample(batch_size_offline) + online_batch = online_buffer.sample(batch_size_online) + batch = [ + torch.vstack(tuple(b)).to(config.device) + for b in zip(offline_batch, online_batch) + ] + + log_dict = trainer.train(batch) + log_dict["offline_iter" if t < config.offline_iterations else "online_iter"] = ( + t if t < config.offline_iterations else t - config.offline_iterations + ) + log_dict.update(online_log) + wandb.log(log_dict, step=trainer.total_it) + # Evaluate episode + if (t + 1) % config.eval_freq == 0: + print(f"Time steps: {t + 1}") + eval_scores, success_rate = eval_actor( + eval_env, + actor, + device=config.device, + n_episodes=config.n_episodes, + seed=config.seed, + ) + eval_score = eval_scores.mean() + eval_log = {} + normalized = eval_env.get_normalized_score(np.mean(eval_scores)) + # Valid only for envs with goal, e.g. AntMaze, Adroit + if t >= config.offline_iterations and is_env_with_goal: + eval_successes.append(success_rate) + eval_log["eval/regret"] = np.mean(1 - np.array(train_successes)) + eval_log["eval/success_rate"] = success_rate + normalized_eval_score = normalized * 100.0 + eval_log["eval/d4rl_normalized_score"] = normalized_eval_score + evaluations.append(normalized_eval_score) + print("---------------------------------------") + print( + f"Evaluation over {config.n_episodes} episodes: " + f"{eval_score:.3f} , D4RL score: {normalized_eval_score:.3f}" + ) + print("---------------------------------------") + if config.checkpoints_path: + torch.save( + trainer.state_dict(), + os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"), + ) + wandb.log(eval_log, step=trainer.total_it) + + +if __name__ == "__main__": + train() diff --git a/algorithms/finetune/cql.py b/algorithms/finetune/cql.py new file mode 100644 index 00000000..867c7a5a --- /dev/null +++ b/algorithms/finetune/cql.py @@ -0,0 +1,1107 @@ +# source: https://github.com/young-geng/CQL/tree/934b0e8354ca431d6c083c4e3a29df88d4b0a24d +# STRONG UNDER-PERFORMANCE ON PART OF ANTMAZE TASKS. 
BUT IN IQL PAPER IT WORKS SOMEHOW +# https://arxiv.org/pdf/2006.04779.pdf +import os +import random +import uuid +from copy import deepcopy +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +import d4rl +import gym +import numpy as np +import pyrallis +import torch +import torch.nn as nn +import torch.nn.functional as F +import wandb +from torch.distributions import Normal, TanhTransform, TransformedDistribution + +TensorBatch = List[torch.Tensor] + +ENVS_WITH_GOAL = ("antmaze", "pen", "door", "hammer", "relocate") + + +@dataclass +class TrainConfig: + # Experiment + device: str = "cuda" + env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name + seed: int = 0 # Sets Gym, PyTorch and Numpy seeds + eval_seed: int = 0 # Eval environment seed + eval_freq: int = int(5e3) # How often (time steps) we evaluate + n_episodes: int = 10 # How many episodes run during evaluation + offline_iterations: int = int(1e6) # Number of offline updates + online_iterations: int = int(1e6) # Number of online updates + checkpoints_path: Optional[str] = None # Save path + load_model: str = "" # Model load file name, "" doesn't load + # CQL + buffer_size: int = 2_000_000 # Replay buffer size + batch_size: int = 256 # Batch size for all networks + discount: float = 0.99 # Discount factor + alpha_multiplier: float = 1.0 # Multiplier for alpha in loss + use_automatic_entropy_tuning: bool = True # Tune entropy + backup_entropy: bool = False # Use backup entropy + policy_lr: float = 3e-5 # Policy learning rate + qf_lr: float = 3e-4 # Critics learning rate + soft_target_update_rate: float = 5e-3 # Target network update rate + bc_steps: int = int(0) # Number of BC steps at start + target_update_period: int = 1 # Frequency of target nets updates + cql_alpha: float = 10.0 # CQL offline regularization parameter + cql_alpha_online: float = 10.0 # CQL online regularization parameter + cql_n_actions: int = 10 # Number of sampled actions + cql_importance_sample: bool = True # Use importance sampling + cql_lagrange: bool = False # Use Lagrange version of CQL + cql_target_action_gap: float = -1.0 # Action gap + cql_temp: float = 1.0 # CQL temperature + cql_max_target_backup: bool = False # Use max target backup + cql_clip_diff_min: float = -np.inf # Q-function lower loss clipping + cql_clip_diff_max: float = np.inf # Q-function upper loss clipping + orthogonal_init: bool = True # Orthogonal initialization + normalize: bool = True # Normalize states + normalize_reward: bool = False # Normalize reward + q_n_hidden_layers: int = 2 # Number of hidden layers in Q networks + reward_scale: float = 1.0 # Reward scale for normalization + reward_bias: float = 0.0 # Reward bias for normalization + # Wandb logging + project: str = "CORL" + group: str = "CQL-D4RL" + name: str = "CQL" + + def __post_init__(self): + self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}" + if self.checkpoints_path is not None: + self.checkpoints_path = os.path.join(self.checkpoints_path, self.name) + + +def soft_update(target: nn.Module, source: nn.Module, tau: float): + for target_param, source_param in zip(target.parameters(), source.parameters()): + target_param.data.copy_((1 - tau) * target_param.data + tau * source_param.data) + + +def compute_mean_std(states: np.ndarray, eps: float) -> Tuple[np.ndarray, np.ndarray]: + mean = states.mean(0) + std = states.std(0) + eps + return mean, std + + +def normalize_states(states: np.ndarray, mean: np.ndarray, std: 
np.ndarray): + return (states - mean) / std + + +def wrap_env( + env: gym.Env, + state_mean: Union[np.ndarray, float] = 0.0, + state_std: Union[np.ndarray, float] = 1.0, + reward_scale: float = 1.0, +) -> gym.Env: + # PEP 8: E731 do not assign a lambda expression, use a def + def normalize_state(state): + return ( + state - state_mean + ) / state_std # epsilon should be already added in std. + + def scale_reward(reward): + # Please be careful, here reward is multiplied by scale! + return reward_scale * reward + + env = gym.wrappers.TransformObservation(env, normalize_state) + if reward_scale != 1.0: + env = gym.wrappers.TransformReward(env, scale_reward) + return env + + +class ReplayBuffer: + def __init__( + self, + state_dim: int, + action_dim: int, + buffer_size: int, + device: str = "cpu", + ): + self._buffer_size = buffer_size + self._pointer = 0 + self._size = 0 + + self._states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._actions = torch.zeros( + (buffer_size, action_dim), dtype=torch.float32, device=device + ) + self._rewards = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._next_states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._dones = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._device = device + + def _to_tensor(self, data: np.ndarray) -> torch.Tensor: + return torch.tensor(data, dtype=torch.float32, device=self._device) + + # Loads data in d4rl format, i.e. from Dict[str, np.array]. + def load_d4rl_dataset(self, data: Dict[str, np.ndarray]): + if self._size != 0: + raise ValueError("Trying to load data into non-empty replay buffer") + n_transitions = data["observations"].shape[0] + if n_transitions > self._buffer_size: + raise ValueError( + "Replay buffer is smaller than the dataset you are trying to load!" + ) + self._states[:n_transitions] = self._to_tensor(data["observations"]) + self._actions[:n_transitions] = self._to_tensor(data["actions"]) + self._rewards[:n_transitions] = self._to_tensor(data["rewards"][..., None]) + self._next_states[:n_transitions] = self._to_tensor(data["next_observations"]) + self._dones[:n_transitions] = self._to_tensor(data["terminals"][..., None]) + self._size += n_transitions + self._pointer = min(self._size, n_transitions) + + print(f"Dataset size: {n_transitions}") + + def sample(self, batch_size: int) -> TensorBatch: + indices = np.random.randint(0, self._size, size=batch_size) + states = self._states[indices] + actions = self._actions[indices] + rewards = self._rewards[indices] + next_states = self._next_states[indices] + dones = self._dones[indices] + return [states, actions, rewards, next_states, dones] + + def add_transition( + self, + state: np.ndarray, + action: np.ndarray, + reward: float, + next_state: np.ndarray, + done: bool, + ): + # Use this method to add new data into the replay buffer during fine-tuning. 
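+ # The buffer is circular: once it is full, the pointer wraps around and the
+ # oldest transitions are overwritten, while the size stays capped at the buffer size.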
+ self._states[self._pointer] = self._to_tensor(state) + self._actions[self._pointer] = self._to_tensor(action) + self._rewards[self._pointer] = self._to_tensor(reward) + self._next_states[self._pointer] = self._to_tensor(next_state) + self._dones[self._pointer] = self._to_tensor(done) + + self._pointer = (self._pointer + 1) % self._buffer_size + self._size = min(self._size + 1, self._buffer_size) + + +def set_env_seed(env: Optional[gym.Env], seed: int): + env.seed(seed) + env.action_space.seed(seed) + + +def set_seed( + seed: int, env: Optional[gym.Env] = None, deterministic_torch: bool = False +): + if env is not None: + set_env_seed(env, seed) + os.environ["PYTHONHASHSEED"] = str(seed) + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(deterministic_torch) + + +def wandb_init(config: dict) -> None: + wandb.init( + config=config, + project=config["project"], + group=config["group"], + name=config["name"], + id=str(uuid.uuid4()), + ) + wandb.run.save() + + +def is_goal_reached(reward: float, info: Dict) -> bool: + if "goal_achieved" in info: + return info["goal_achieved"] + return reward > 0 # Assuming that reaching target is a positive reward + + +@torch.no_grad() +def eval_actor( + env: gym.Env, actor: nn.Module, device: str, n_episodes: int, seed: int +) -> Tuple[np.ndarray, np.ndarray]: + env.seed(seed) + actor.eval() + episode_rewards = [] + successes = [] + for _ in range(n_episodes): + state, done = env.reset(), False + episode_reward = 0.0 + goal_achieved = False + while not done: + action = actor.act(state, device) + state, reward, done, env_infos = env.step(action) + episode_reward += reward + if not goal_achieved: + goal_achieved = is_goal_reached(reward, env_infos) + # Valid only for environments with goal + successes.append(float(goal_achieved)) + episode_rewards.append(episode_reward) + + actor.train() + return np.asarray(episode_rewards), np.mean(successes) + + +def return_reward_range(dataset: Dict, max_episode_steps: int) -> Tuple[float, float]: + returns, lengths = [], [] + ep_ret, ep_len = 0.0, 0 + for r, d in zip(dataset["rewards"], dataset["terminals"]): + ep_ret += float(r) + ep_len += 1 + if d or ep_len == max_episode_steps: + returns.append(ep_ret) + lengths.append(ep_len) + ep_ret, ep_len = 0.0, 0 + lengths.append(ep_len) # but still keep track of number of steps + assert sum(lengths) == len(dataset["rewards"]) + return min(returns), max(returns) + + +def modify_reward( + dataset: Dict, + env_name: str, + max_episode_steps: int = 1000, + reward_scale: float = 1.0, + reward_bias: float = 0.0, +) -> Dict: + modification_data = {} + if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): + min_ret, max_ret = return_reward_range(dataset, max_episode_steps) + dataset["rewards"] /= max_ret - min_ret + dataset["rewards"] *= max_episode_steps + modification_data = { + "max_ret": max_ret, + "min_ret": min_ret, + "max_episode_steps": max_episode_steps, + } + dataset["rewards"] = dataset["rewards"] * reward_scale + reward_bias + return modification_data + + +def modify_reward_online( + reward: float, + env_name: str, + reward_scale: float = 1.0, + reward_bias: float = 0.0, + **kwargs, +) -> float: + if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): + reward /= kwargs["max_ret"] - kwargs["min_ret"] + reward *= kwargs["max_episode_steps"] + reward = reward * reward_scale + reward_bias + return reward + + +def extend_and_repeat(tensor: torch.Tensor, dim: int, repeat: int) -> torch.Tensor: + 
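+ # e.g. with dim=1: (batch, x_dim) -> (batch, repeat, x_dim); used below to pair
+ # each observation with multiple sampled actions.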
return tensor.unsqueeze(dim).repeat_interleave(repeat, dim=dim) + + +def init_module_weights(module: torch.nn.Module, orthogonal_init: bool = False): + if isinstance(module, nn.Linear): + if orthogonal_init: + nn.init.orthogonal_(module.weight, gain=np.sqrt(2)) + nn.init.constant_(module.bias, 0.0) + else: + nn.init.xavier_uniform_(module.weight, gain=1e-2) + + +class ReparameterizedTanhGaussian(nn.Module): + def __init__( + self, log_std_min: float = -20.0, log_std_max: float = 2.0, no_tanh: bool = False + ): + super().__init__() + self.log_std_min = log_std_min + self.log_std_max = log_std_max + self.no_tanh = no_tanh + + def log_prob( + self, mean: torch.Tensor, log_std: torch.Tensor, sample: torch.Tensor + ) -> torch.Tensor: + log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) + std = torch.exp(log_std) + if self.no_tanh: + action_distribution = Normal(mean, std) + else: + action_distribution = TransformedDistribution( + Normal(mean, std), TanhTransform(cache_size=1) + ) + return torch.sum(action_distribution.log_prob(sample), dim=-1) + + def forward( + self, mean: torch.Tensor, log_std: torch.Tensor, deterministic: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor]: + log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) + std = torch.exp(log_std) + + if self.no_tanh: + action_distribution = Normal(mean, std) + else: + action_distribution = TransformedDistribution( + Normal(mean, std), TanhTransform(cache_size=1) + ) + + if deterministic: + action_sample = torch.tanh(mean) + else: + action_sample = action_distribution.rsample() + + log_prob = torch.sum(action_distribution.log_prob(action_sample), dim=-1) + + return action_sample, log_prob + + +class TanhGaussianPolicy(nn.Module): + def __init__( + self, + state_dim: int, + action_dim: int, + max_action: float, + log_std_multiplier: float = 1.0, + log_std_offset: float = -1.0, + orthogonal_init: bool = False, + no_tanh: bool = False, + ): + super().__init__() + self.observation_dim = state_dim + self.action_dim = action_dim + self.max_action = max_action + self.orthogonal_init = orthogonal_init + self.no_tanh = no_tanh + + self.base_network = nn.Sequential( + nn.Linear(state_dim, 256), + nn.ReLU(), + nn.Linear(256, 256), + nn.ReLU(), + nn.Linear(256, 256), + nn.ReLU(), + nn.Linear(256, 2 * action_dim), + ) + + if orthogonal_init: + self.base_network.apply(lambda m: init_module_weights(m, True)) + else: + init_module_weights(self.base_network[-1], False) + + self.log_std_multiplier = Scalar(log_std_multiplier) + self.log_std_offset = Scalar(log_std_offset) + self.tanh_gaussian = ReparameterizedTanhGaussian(no_tanh=no_tanh) + + def log_prob( + self, observations: torch.Tensor, actions: torch.Tensor + ) -> torch.Tensor: + if actions.ndim == 3: + observations = extend_and_repeat(observations, 1, actions.shape[1]) + base_network_output = self.base_network(observations) + mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) + log_std = self.log_std_multiplier() * log_std + self.log_std_offset() + _, log_probs = self.tanh_gaussian(mean, log_std, False) + return log_probs + + def forward( + self, + observations: torch.Tensor, + deterministic: bool = False, + repeat: bool = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + if repeat is not None: + observations = extend_and_repeat(observations, 1, repeat) + base_network_output = self.base_network(observations) + mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) + log_std = self.log_std_multiplier() * log_std + 
self.log_std_offset() + actions, log_probs = self.tanh_gaussian(mean, log_std, deterministic) + return self.max_action * actions, log_probs + + @torch.no_grad() + def act(self, state: np.ndarray, device: str = "cpu"): + state = torch.tensor(state.reshape(1, -1), device=device, dtype=torch.float32) + with torch.no_grad(): + actions, _ = self(state, not self.training) + return actions.cpu().data.numpy().flatten() + + +class FullyConnectedQFunction(nn.Module): + def __init__( + self, + observation_dim: int, + action_dim: int, + orthogonal_init: bool = False, + n_hidden_layers: int = 2, + ): + super().__init__() + self.observation_dim = observation_dim + self.action_dim = action_dim + self.orthogonal_init = orthogonal_init + + layers = [ + nn.Linear(observation_dim + action_dim, 256), + nn.ReLU(), + ] + for _ in range(n_hidden_layers - 1): + layers.append(nn.Linear(256, 256)) + layers.append(nn.ReLU()) + layers.append(nn.Linear(256, 1)) + + self.network = nn.Sequential(*layers) + + if orthogonal_init: + self.network.apply(lambda m: init_module_weights(m, True)) + else: + init_module_weights(self.network[-1], False) + + def forward(self, observations: torch.Tensor, actions: torch.Tensor) -> torch.Tensor: + multiple_actions = False + batch_size = observations.shape[0] + if actions.ndim == 3 and observations.ndim == 2: + multiple_actions = True + observations = extend_and_repeat(observations, 1, actions.shape[1]).reshape( + -1, observations.shape[-1] + ) + actions = actions.reshape(-1, actions.shape[-1]) + input_tensor = torch.cat([observations, actions], dim=-1) + q_values = torch.squeeze(self.network(input_tensor), dim=-1) + if multiple_actions: + q_values = q_values.reshape(batch_size, -1) + return q_values + + +class Scalar(nn.Module): + def __init__(self, init_value: float): + super().__init__() + self.constant = nn.Parameter(torch.tensor(init_value, dtype=torch.float32)) + + def forward(self) -> nn.Parameter: + return self.constant + + +class ContinuousCQL: + def __init__( + self, + critic_1, + critic_1_optimizer, + critic_2, + critic_2_optimizer, + actor, + actor_optimizer, + target_entropy: float, + discount: float = 0.99, + alpha_multiplier: float = 1.0, + use_automatic_entropy_tuning: bool = True, + backup_entropy: bool = False, + policy_lr: bool = 3e-4, + qf_lr: bool = 3e-4, + soft_target_update_rate: float = 5e-3, + bc_steps=100000, + target_update_period: int = 1, + cql_n_actions: int = 10, + cql_importance_sample: bool = True, + cql_lagrange: bool = False, + cql_target_action_gap: float = -1.0, + cql_temp: float = 1.0, + cql_alpha: float = 5.0, + cql_max_target_backup: bool = False, + cql_clip_diff_min: float = -np.inf, + cql_clip_diff_max: float = np.inf, + device: str = "cpu", + ): + super().__init__() + + self.discount = discount + self.target_entropy = target_entropy + self.alpha_multiplier = alpha_multiplier + self.use_automatic_entropy_tuning = use_automatic_entropy_tuning + self.backup_entropy = backup_entropy + self.policy_lr = policy_lr + self.qf_lr = qf_lr + self.soft_target_update_rate = soft_target_update_rate + self.bc_steps = bc_steps + self.target_update_period = target_update_period + self.cql_n_actions = cql_n_actions + self.cql_importance_sample = cql_importance_sample + self.cql_lagrange = cql_lagrange + self.cql_target_action_gap = cql_target_action_gap + self.cql_temp = cql_temp + self.cql_alpha = cql_alpha + self.cql_max_target_backup = cql_max_target_backup + self.cql_clip_diff_min = cql_clip_diff_min + self.cql_clip_diff_max = cql_clip_diff_max + self._device 
= device + + self.total_it = 0 + + self.critic_1 = critic_1 + self.critic_2 = critic_2 + + self.target_critic_1 = deepcopy(self.critic_1).to(device) + self.target_critic_2 = deepcopy(self.critic_2).to(device) + + self.actor = actor + + self.actor_optimizer = actor_optimizer + self.critic_1_optimizer = critic_1_optimizer + self.critic_2_optimizer = critic_2_optimizer + + if self.use_automatic_entropy_tuning: + self.log_alpha = Scalar(0.0) + self.alpha_optimizer = torch.optim.Adam( + self.log_alpha.parameters(), + lr=self.policy_lr, + ) + else: + self.log_alpha = None + + self.log_alpha_prime = Scalar(1.0) + self.alpha_prime_optimizer = torch.optim.Adam( + self.log_alpha_prime.parameters(), + lr=self.qf_lr, + ) + + self.total_it = 0 + + def update_target_network(self, soft_target_update_rate: float): + soft_update(self.target_critic_1, self.critic_1, soft_target_update_rate) + soft_update(self.target_critic_2, self.critic_2, soft_target_update_rate) + + def _alpha_and_alpha_loss(self, observations: torch.Tensor, log_pi: torch.Tensor): + if self.use_automatic_entropy_tuning: + alpha_loss = -( + self.log_alpha() * (log_pi + self.target_entropy).detach() + ).mean() + alpha = self.log_alpha().exp() * self.alpha_multiplier + else: + alpha_loss = observations.new_tensor(0.0) + alpha = observations.new_tensor(self.alpha_multiplier) + return alpha, alpha_loss + + def _policy_loss( + self, + observations: torch.Tensor, + actions: torch.Tensor, + new_actions: torch.Tensor, + alpha: torch.Tensor, + log_pi: torch.Tensor, + ) -> torch.Tensor: + if self.total_it <= self.bc_steps: + log_probs = self.actor.log_prob(observations, actions) + policy_loss = (alpha * log_pi - log_probs).mean() + else: + q_new_actions = torch.min( + self.critic_1(observations, new_actions), + self.critic_2(observations, new_actions), + ) + policy_loss = (alpha * log_pi - q_new_actions).mean() + return policy_loss + + def _q_loss( + self, + observations: torch.Tensor, + actions: torch.Tensor, + next_observations: torch.Tensor, + rewards: torch.Tensor, + dones: torch.Tensor, + alpha: torch.Tensor, + log_dict: Dict, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + q1_predicted = self.critic_1(observations, actions) + q2_predicted = self.critic_2(observations, actions) + + if self.cql_max_target_backup: + new_next_actions, next_log_pi = self.actor( + next_observations, repeat=self.cql_n_actions + ) + target_q_values, max_target_indices = torch.max( + torch.min( + self.target_critic_1(next_observations, new_next_actions), + self.target_critic_2(next_observations, new_next_actions), + ), + dim=-1, + ) + next_log_pi = torch.gather( + next_log_pi, -1, max_target_indices.unsqueeze(-1) + ).squeeze(-1) + else: + new_next_actions, next_log_pi = self.actor(next_observations) + target_q_values = torch.min( + self.target_critic_1(next_observations, new_next_actions), + self.target_critic_2(next_observations, new_next_actions), + ) + + if self.backup_entropy: + target_q_values = target_q_values - alpha * next_log_pi + + target_q_values = target_q_values.unsqueeze(-1) + td_target = rewards + (1.0 - dones) * self.discount * target_q_values.detach() + td_target = td_target.squeeze(-1) + qf1_loss = F.mse_loss(q1_predicted, td_target.detach()) + qf2_loss = F.mse_loss(q2_predicted, td_target.detach()) + + # CQL + batch_size = actions.shape[0] + action_dim = actions.shape[-1] + cql_random_actions = actions.new_empty( + (batch_size, self.cql_n_actions, action_dim), requires_grad=False + ).uniform_(-1, 1) + cql_current_actions, 
cql_current_log_pis = self.actor( + observations, repeat=self.cql_n_actions + ) + cql_next_actions, cql_next_log_pis = self.actor( + next_observations, repeat=self.cql_n_actions + ) + cql_current_actions, cql_current_log_pis = ( + cql_current_actions.detach(), + cql_current_log_pis.detach(), + ) + cql_next_actions, cql_next_log_pis = ( + cql_next_actions.detach(), + cql_next_log_pis.detach(), + ) + + cql_q1_rand = self.critic_1(observations, cql_random_actions) + cql_q2_rand = self.critic_2(observations, cql_random_actions) + cql_q1_current_actions = self.critic_1(observations, cql_current_actions) + cql_q2_current_actions = self.critic_2(observations, cql_current_actions) + cql_q1_next_actions = self.critic_1(observations, cql_next_actions) + cql_q2_next_actions = self.critic_2(observations, cql_next_actions) + + cql_cat_q1 = torch.cat( + [ + cql_q1_rand, + torch.unsqueeze(q1_predicted, 1), + cql_q1_next_actions, + cql_q1_current_actions, + ], + dim=1, + ) + cql_cat_q2 = torch.cat( + [ + cql_q2_rand, + torch.unsqueeze(q2_predicted, 1), + cql_q2_next_actions, + cql_q2_current_actions, + ], + dim=1, + ) + cql_std_q1 = torch.std(cql_cat_q1, dim=1) + cql_std_q2 = torch.std(cql_cat_q2, dim=1) + + if self.cql_importance_sample: + random_density = np.log(0.5**action_dim) + cql_cat_q1 = torch.cat( + [ + cql_q1_rand - random_density, + cql_q1_next_actions - cql_next_log_pis.detach(), + cql_q1_current_actions - cql_current_log_pis.detach(), + ], + dim=1, + ) + cql_cat_q2 = torch.cat( + [ + cql_q2_rand - random_density, + cql_q2_next_actions - cql_next_log_pis.detach(), + cql_q2_current_actions - cql_current_log_pis.detach(), + ], + dim=1, + ) + + cql_qf1_ood = torch.logsumexp(cql_cat_q1 / self.cql_temp, dim=1) * self.cql_temp + cql_qf2_ood = torch.logsumexp(cql_cat_q2 / self.cql_temp, dim=1) * self.cql_temp + + """Subtract the log likelihood of data""" + cql_qf1_diff = torch.clamp( + cql_qf1_ood - q1_predicted, + self.cql_clip_diff_min, + self.cql_clip_diff_max, + ).mean() + cql_qf2_diff = torch.clamp( + cql_qf2_ood - q2_predicted, + self.cql_clip_diff_min, + self.cql_clip_diff_max, + ).mean() + + if self.cql_lagrange: + alpha_prime = torch.clamp( + torch.exp(self.log_alpha_prime()), min=0.0, max=1000000.0 + ) + cql_min_qf1_loss = ( + alpha_prime + * self.cql_alpha + * (cql_qf1_diff - self.cql_target_action_gap) + ) + cql_min_qf2_loss = ( + alpha_prime + * self.cql_alpha + * (cql_qf2_diff - self.cql_target_action_gap) + ) + + self.alpha_prime_optimizer.zero_grad() + alpha_prime_loss = (-cql_min_qf1_loss - cql_min_qf2_loss) * 0.5 + alpha_prime_loss.backward(retain_graph=True) + self.alpha_prime_optimizer.step() + else: + cql_min_qf1_loss = cql_qf1_diff * self.cql_alpha + cql_min_qf2_loss = cql_qf2_diff * self.cql_alpha + alpha_prime_loss = observations.new_tensor(0.0) + alpha_prime = observations.new_tensor(0.0) + + qf_loss = qf1_loss + qf2_loss + cql_min_qf1_loss + cql_min_qf2_loss + + log_dict.update( + dict( + qf1_loss=qf1_loss.item(), + qf2_loss=qf2_loss.item(), + alpha=alpha.item(), + average_qf1=q1_predicted.mean().item(), + average_qf2=q2_predicted.mean().item(), + average_target_q=target_q_values.mean().item(), + ) + ) + + log_dict.update( + dict( + cql_std_q1=cql_std_q1.mean().item(), + cql_std_q2=cql_std_q2.mean().item(), + cql_q1_rand=cql_q1_rand.mean().item(), + cql_q2_rand=cql_q2_rand.mean().item(), + cql_min_qf1_loss=cql_min_qf1_loss.mean().item(), + cql_min_qf2_loss=cql_min_qf2_loss.mean().item(), + cql_qf1_diff=cql_qf1_diff.mean().item(), + cql_qf2_diff=cql_qf2_diff.mean().item(), + 
cql_q1_current_actions=cql_q1_current_actions.mean().item(), + cql_q2_current_actions=cql_q2_current_actions.mean().item(), + cql_q1_next_actions=cql_q1_next_actions.mean().item(), + cql_q2_next_actions=cql_q2_next_actions.mean().item(), + alpha_prime_loss=alpha_prime_loss.item(), + alpha_prime=alpha_prime.item(), + ) + ) + + return qf_loss, alpha_prime, alpha_prime_loss + + def train(self, batch: TensorBatch) -> Dict[str, float]: + ( + observations, + actions, + rewards, + next_observations, + dones, + ) = batch + self.total_it += 1 + + new_actions, log_pi = self.actor(observations) + + alpha, alpha_loss = self._alpha_and_alpha_loss(observations, log_pi) + + """ Policy loss """ + policy_loss = self._policy_loss( + observations, actions, new_actions, alpha, log_pi + ) + + log_dict = dict( + log_pi=log_pi.mean().item(), + policy_loss=policy_loss.item(), + alpha_loss=alpha_loss.item(), + alpha=alpha.item(), + ) + + """ Q function loss """ + qf_loss, alpha_prime, alpha_prime_loss = self._q_loss( + observations, actions, next_observations, rewards, dones, alpha, log_dict + ) + + if self.use_automatic_entropy_tuning: + self.alpha_optimizer.zero_grad() + alpha_loss.backward() + self.alpha_optimizer.step() + + self.actor_optimizer.zero_grad() + policy_loss.backward() + self.actor_optimizer.step() + + self.critic_1_optimizer.zero_grad() + self.critic_2_optimizer.zero_grad() + qf_loss.backward(retain_graph=True) + self.critic_1_optimizer.step() + self.critic_2_optimizer.step() + + if self.total_it % self.target_update_period == 0: + self.update_target_network(self.soft_target_update_rate) + + return log_dict + + def state_dict(self) -> Dict[str, Any]: + return { + "actor": self.actor.state_dict(), + "critic1": self.critic_1.state_dict(), + "critic2": self.critic_2.state_dict(), + "critic1_target": self.target_critic_1.state_dict(), + "critic2_target": self.target_critic_2.state_dict(), + "critic_1_optimizer": self.critic_1_optimizer.state_dict(), + "critic_2_optimizer": self.critic_2_optimizer.state_dict(), + "actor_optim": self.actor_optimizer.state_dict(), + "sac_log_alpha": self.log_alpha, + "sac_log_alpha_optim": self.alpha_optimizer.state_dict(), + "cql_log_alpha": self.log_alpha_prime, + "cql_log_alpha_optim": self.alpha_prime_optimizer.state_dict(), + "total_it": self.total_it, + } + + def load_state_dict(self, state_dict: Dict[str, Any]): + self.actor.load_state_dict(state_dict=state_dict["actor"]) + self.critic_1.load_state_dict(state_dict=state_dict["critic1"]) + self.critic_2.load_state_dict(state_dict=state_dict["critic2"]) + + self.target_critic_1.load_state_dict(state_dict=state_dict["critic1_target"]) + self.target_critic_2.load_state_dict(state_dict=state_dict["critic2_target"]) + + self.critic_1_optimizer.load_state_dict( + state_dict=state_dict["critic_1_optimizer"] + ) + self.critic_2_optimizer.load_state_dict( + state_dict=state_dict["critic_2_optimizer"] + ) + self.actor_optimizer.load_state_dict(state_dict=state_dict["actor_optim"]) + + self.log_alpha = state_dict["sac_log_alpha"] + self.alpha_optimizer.load_state_dict( + state_dict=state_dict["sac_log_alpha_optim"] + ) + + self.log_alpha_prime = state_dict["cql_log_alpha"] + self.alpha_prime_optimizer.load_state_dict( + state_dict=state_dict["cql_log_alpha_optim"] + ) + self.total_it = state_dict["total_it"] + + +@pyrallis.wrap() +def train(config: TrainConfig): + env = gym.make(config.env) + eval_env = gym.make(config.env) + + is_env_with_goal = config.env.startswith(ENVS_WITH_GOAL) + + max_steps = env._max_episode_steps + + 
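+ # Note: `_max_episode_steps` is a private gym attribute; it is read here so that, + # during online fine-tuning below, environment timeouts can be told apart from + # true terminal states (`real_done`) when new transitions are stored.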
state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + + dataset = d4rl.qlearning_dataset(env) + + reward_mod_dict = {} + if config.normalize_reward: + reward_mod_dict = modify_reward( + dataset, + config.env, + reward_scale=config.reward_scale, + reward_bias=config.reward_bias, + ) + + if config.normalize: + state_mean, state_std = compute_mean_std(dataset["observations"], eps=1e-3) + else: + state_mean, state_std = 0, 1 + + dataset["observations"] = normalize_states( + dataset["observations"], state_mean, state_std + ) + dataset["next_observations"] = normalize_states( + dataset["next_observations"], state_mean, state_std + ) + env = wrap_env(env, state_mean=state_mean, state_std=state_std) + eval_env = wrap_env(eval_env, state_mean=state_mean, state_std=state_std) + replay_buffer = ReplayBuffer( + state_dim, + action_dim, + config.buffer_size, + config.device, + ) + replay_buffer.load_d4rl_dataset(dataset) + + max_action = float(env.action_space.high[0]) + + if config.checkpoints_path is not None: + print(f"Checkpoints path: {config.checkpoints_path}") + os.makedirs(config.checkpoints_path, exist_ok=True) + with open(os.path.join(config.checkpoints_path, "config.yaml"), "w") as f: + pyrallis.dump(config, f) + + # Set seeds + seed = config.seed + set_seed(seed, env) + set_env_seed(eval_env, config.eval_seed) + + critic_1 = FullyConnectedQFunction( + state_dim, + action_dim, + config.orthogonal_init, + config.q_n_hidden_layers, + ).to(config.device) + critic_2 = FullyConnectedQFunction( + state_dim, + action_dim, + config.orthogonal_init, + config.q_n_hidden_layers, + ).to(config.device) + critic_1_optimizer = torch.optim.Adam(list(critic_1.parameters()), config.qf_lr) + critic_2_optimizer = torch.optim.Adam(list(critic_2.parameters()), config.qf_lr) + + actor = TanhGaussianPolicy( + state_dim, action_dim, max_action, orthogonal_init=config.orthogonal_init + ).to(config.device) + actor_optimizer = torch.optim.Adam(actor.parameters(), config.policy_lr) + + kwargs = { + "critic_1": critic_1, + "critic_2": critic_2, + "critic_1_optimizer": critic_1_optimizer, + "critic_2_optimizer": critic_2_optimizer, + "actor": actor, + "actor_optimizer": actor_optimizer, + "discount": config.discount, + "soft_target_update_rate": config.soft_target_update_rate, + "device": config.device, + # CQL + "target_entropy": -np.prod(env.action_space.shape).item(), + "alpha_multiplier": config.alpha_multiplier, + "use_automatic_entropy_tuning": config.use_automatic_entropy_tuning, + "backup_entropy": config.backup_entropy, + "policy_lr": config.policy_lr, + "qf_lr": config.qf_lr, + "bc_steps": config.bc_steps, + "target_update_period": config.target_update_period, + "cql_n_actions": config.cql_n_actions, + "cql_importance_sample": config.cql_importance_sample, + "cql_lagrange": config.cql_lagrange, + "cql_target_action_gap": config.cql_target_action_gap, + "cql_temp": config.cql_temp, + "cql_alpha": config.cql_alpha, + "cql_max_target_backup": config.cql_max_target_backup, + "cql_clip_diff_min": config.cql_clip_diff_min, + "cql_clip_diff_max": config.cql_clip_diff_max, + } + + print("---------------------------------------") + print(f"Training CQL, Env: {config.env}, Seed: {seed}") + print("---------------------------------------") + + # Initialize actor + trainer = ContinuousCQL(**kwargs) + + if config.load_model != "": + policy_file = Path(config.load_model) + trainer.load_state_dict(torch.load(policy_file)) + actor = trainer.actor + + wandb_init(asdict(config)) + + evaluations = [] 
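+ # Note: the single loop below covers both phases: for the first `offline_iterations` + # steps the agent only takes gradient updates on the loaded D4RL buffer; afterwards it + # starts interacting with `env`, appends fresh transitions to the same replay buffer, + # and switches the conservatism weight to `cql_alpha_online`.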
+ state, done = env.reset(), False + episode_return = 0 + episode_step = 0 + goal_achieved = False + + eval_successes = [] + train_successes = [] + + print("Offline pretraining") + for t in range(int(config.offline_iterations) + int(config.online_iterations)): + if t == config.offline_iterations: + print("Online tuning") + trainer.cql_alpha = config.cql_alpha_online + online_log = {} + if t >= config.offline_iterations: + episode_step += 1 + action, _ = actor( + torch.tensor( + state.reshape(1, -1), device=config.device, dtype=torch.float32 + ) + ) + action = action.cpu().data.numpy().flatten() + next_state, reward, done, env_infos = env.step(action) + if not goal_achieved: + goal_achieved = is_goal_reached(reward, env_infos) + + episode_return += reward + real_done = False # Episode can timeout which is different from done + if done and episode_step < max_steps: + real_done = True + + if config.normalize_reward: + reward = modify_reward_online( + reward, + config.env, + reward_scale=config.reward_scale, + reward_bias=config.reward_bias, + **reward_mod_dict, + ) + replay_buffer.add_transition(state, action, reward, next_state, real_done) + state = next_state + + if done: + state, done = env.reset(), False + # Valid only for envs with goal, e.g. AntMaze, Adroit + if is_env_with_goal: + train_successes.append(goal_achieved) + online_log["train/regret"] = np.mean(1 - np.array(train_successes)) + online_log["train/is_success"] = float(goal_achieved) + online_log["train/episode_return"] = episode_return + normalized_return = eval_env.get_normalized_score(episode_return) + online_log["train/d4rl_normalized_episode_return"] = ( + normalized_return * 100.0 + ) + online_log["train/episode_length"] = episode_step + episode_return = 0 + episode_step = 0 + goal_achieved = False + + batch = replay_buffer.sample(config.batch_size) + batch = [b.to(config.device) for b in batch] + log_dict = trainer.train(batch) + log_dict["offline_iter" if t < config.offline_iterations else "online_iter"] = ( + t if t < config.offline_iterations else t - config.offline_iterations + ) + log_dict.update(online_log) + wandb.log(log_dict, step=trainer.total_it) + # Evaluate episode + if (t + 1) % config.eval_freq == 0: + print(f"Time steps: {t + 1}") + eval_scores, success_rate = eval_actor( + eval_env, + actor, + device=config.device, + n_episodes=config.n_episodes, + seed=config.seed, + ) + eval_score = eval_scores.mean() + eval_log = {} + normalized = eval_env.get_normalized_score(np.mean(eval_scores)) + # Valid only for envs with goal, e.g. 
AntMaze, Adroit + if t >= config.offline_iterations and is_env_with_goal: + eval_successes.append(success_rate) + eval_log["eval/regret"] = np.mean(1 - np.array(train_successes)) + eval_log["eval/success_rate"] = success_rate + normalized_eval_score = normalized * 100.0 + eval_log["eval/d4rl_normalized_score"] = normalized_eval_score + evaluations.append(normalized_eval_score) + print("---------------------------------------") + print( + f"Evaluation over {config.n_episodes} episodes: " + f"{eval_score:.3f} , D4RL score: {normalized_eval_score:.3f}" + ) + print("---------------------------------------") + if config.checkpoints_path: + torch.save( + trainer.state_dict(), + os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"), + ) + wandb.log(eval_log, step=trainer.total_it) + + +if __name__ == "__main__": + train() diff --git a/algorithms/finetune/iql.py b/algorithms/finetune/iql.py new file mode 100644 index 00000000..8e743aa9 --- /dev/null +++ b/algorithms/finetune/iql.py @@ -0,0 +1,771 @@ +# source: https://github.com/gwthomas/IQL-PyTorch +# https://arxiv.org/pdf/2110.06169.pdf +import copy +import os +import random +import uuid +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import d4rl +import gym +import numpy as np +import pyrallis +import torch +import torch.nn as nn +import torch.nn.functional as F +import wandb +from torch.distributions import Normal +from torch.optim.lr_scheduler import CosineAnnealingLR + +TensorBatch = List[torch.Tensor] + + +EXP_ADV_MAX = 100.0 +LOG_STD_MIN = -20.0 +LOG_STD_MAX = 2.0 +ENVS_WITH_GOAL = ("antmaze", "pen", "door", "hammer", "relocate") + + +@dataclass +class TrainConfig: + # Experiment + device: str = "cuda" + env: str = "antmaze-umaze-v2" # OpenAI gym environment name + seed: int = 0 # Sets Gym, PyTorch and Numpy seeds + eval_seed: int = 0 # Eval environment seed + eval_freq: int = int(5e4) # How often (time steps) we evaluate + n_episodes: int = 100 # How many episodes run during evaluation + offline_iterations: int = int(1e6) # Number of offline updates + online_iterations: int = int(1e6) # Number of online updates + checkpoints_path: Optional[str] = None # Save path + load_model: str = "" # Model load file name, "" doesn't load + # IQL + actor_dropout: float = 0.0 # Dropout in actor network + buffer_size: int = 2_000_000 # Replay buffer size + batch_size: int = 256 # Batch size for all networks + discount: float = 0.99 # Discount factor + tau: float = 0.005 # Target network update rate + beta: float = 3.0 # Inverse temperature. 
Small beta -> BC, big beta -> maximizing Q + iql_tau: float = 0.7 # Coefficient for asymmetric loss + expl_noise: float = 0.03 # Std of Gaussian exploration noise + noise_clip: float = 0.5 # Range to clip noise + iql_deterministic: bool = False # Use deterministic actor + normalize: bool = True # Normalize states + normalize_reward: bool = False # Normalize reward + vf_lr: float = 3e-4 # V function learning rate + qf_lr: float = 3e-4 # Critic learning rate + actor_lr: float = 3e-4 # Actor learning rate + # Wandb logging + project: str = "CORL" + group: str = "IQL-D4RL" + name: str = "IQL" + + def __post_init__(self): + self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}" + if self.checkpoints_path is not None: + self.checkpoints_path = os.path.join(self.checkpoints_path, self.name) + + +def soft_update(target: nn.Module, source: nn.Module, tau: float): + for target_param, source_param in zip(target.parameters(), source.parameters()): + target_param.data.copy_((1 - tau) * target_param.data + tau * source_param.data) + + +def compute_mean_std(states: np.ndarray, eps: float) -> Tuple[np.ndarray, np.ndarray]: + mean = states.mean(0) + std = states.std(0) + eps + return mean, std + + +def normalize_states(states: np.ndarray, mean: np.ndarray, std: np.ndarray): + return (states - mean) / std + + +def wrap_env( + env: gym.Env, + state_mean: Union[np.ndarray, float] = 0.0, + state_std: Union[np.ndarray, float] = 1.0, + reward_scale: float = 1.0, +) -> gym.Env: + # PEP 8: E731 do not assign a lambda expression, use a def + def normalize_state(state): + return ( + state - state_mean + ) / state_std # epsilon should be already added in std. + + def scale_reward(reward): + # Please be careful, here reward is multiplied by scale! + return reward_scale * reward + + env = gym.wrappers.TransformObservation(env, normalize_state) + if reward_scale != 1.0: + env = gym.wrappers.TransformReward(env, scale_reward) + return env + + +class ReplayBuffer: + def __init__( + self, + state_dim: int, + action_dim: int, + buffer_size: int, + device: str = "cpu", + ): + self._buffer_size = buffer_size + self._pointer = 0 + self._size = 0 + + self._states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._actions = torch.zeros( + (buffer_size, action_dim), dtype=torch.float32, device=device + ) + self._rewards = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._next_states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._dones = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._device = device + + def _to_tensor(self, data: np.ndarray) -> torch.Tensor: + return torch.tensor(data, dtype=torch.float32, device=self._device) + + # Loads data in d4rl format, i.e. from Dict[str, np.array]. + def load_d4rl_dataset(self, data: Dict[str, np.ndarray]): + if self._size != 0: + raise ValueError("Trying to load data into non-empty replay buffer") + n_transitions = data["observations"].shape[0] + if n_transitions > self._buffer_size: + raise ValueError( + "Replay buffer is smaller than the dataset you are trying to load!" 
+ ) + self._states[:n_transitions] = self._to_tensor(data["observations"]) + self._actions[:n_transitions] = self._to_tensor(data["actions"]) + self._rewards[:n_transitions] = self._to_tensor(data["rewards"][..., None]) + self._next_states[:n_transitions] = self._to_tensor(data["next_observations"]) + self._dones[:n_transitions] = self._to_tensor(data["terminals"][..., None]) + self._size += n_transitions + self._pointer = min(self._size, n_transitions) + + print(f"Dataset size: {n_transitions}") + + def sample(self, batch_size: int) -> TensorBatch: + indices = np.random.randint(0, self._size, size=batch_size) + states = self._states[indices] + actions = self._actions[indices] + rewards = self._rewards[indices] + next_states = self._next_states[indices] + dones = self._dones[indices] + return [states, actions, rewards, next_states, dones] + + def add_transition( + self, + state: np.ndarray, + action: np.ndarray, + reward: float, + next_state: np.ndarray, + done: bool, + ): + # Use this method to add new data into the replay buffer during fine-tuning. + self._states[self._pointer] = self._to_tensor(state) + self._actions[self._pointer] = self._to_tensor(action) + self._rewards[self._pointer] = self._to_tensor(reward) + self._next_states[self._pointer] = self._to_tensor(next_state) + self._dones[self._pointer] = self._to_tensor(done) + + self._pointer = (self._pointer + 1) % self._buffer_size + self._size = min(self._size + 1, self._buffer_size) + # raise NotImplementedError + + +def set_env_seed(env: Optional[gym.Env], seed: int): + env.seed(seed) + env.action_space.seed(seed) + + +def set_seed( + seed: int, env: Optional[gym.Env] = None, deterministic_torch: bool = False +): + if env is not None: + set_env_seed(env, seed) + os.environ["PYTHONHASHSEED"] = str(seed) + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(deterministic_torch) + + +def wandb_init(config: dict) -> None: + wandb.init( + config=config, + project=config["project"], + group=config["group"], + name=config["name"], + id=str(uuid.uuid4()), + ) + wandb.run.save() + + +def is_goal_reached(reward: float, info: Dict) -> bool: + if "goal_achieved" in info: + return info["goal_achieved"] + return reward > 0 # Assuming that reaching target is a positive reward + + +@torch.no_grad() +def eval_actor( + env: gym.Env, actor: nn.Module, device: str, n_episodes: int, seed: int +) -> Tuple[np.ndarray, np.ndarray]: + env.seed(seed) + actor.eval() + episode_rewards = [] + successes = [] + for _ in range(n_episodes): + state, done = env.reset(), False + episode_reward = 0.0 + goal_achieved = False + while not done: + action = actor.act(state, device) + state, reward, done, env_infos = env.step(action) + episode_reward += reward + if not goal_achieved: + goal_achieved = is_goal_reached(reward, env_infos) + # Valid only for environments with goal + successes.append(float(goal_achieved)) + episode_rewards.append(episode_reward) + + actor.train() + return np.asarray(episode_rewards), np.mean(successes) + + +def return_reward_range(dataset: Dict, max_episode_steps: int) -> Tuple[float, float]: + returns, lengths = [], [] + ep_ret, ep_len = 0.0, 0 + for r, d in zip(dataset["rewards"], dataset["terminals"]): + ep_ret += float(r) + ep_len += 1 + if d or ep_len == max_episode_steps: + returns.append(ep_ret) + lengths.append(ep_len) + ep_ret, ep_len = 0.0, 0 + lengths.append(ep_len) # but still keep track of number of steps + assert sum(lengths) == len(dataset["rewards"]) + return min(returns), 
max(returns) + + +def modify_reward(dataset: Dict, env_name: str, max_episode_steps: int = 1000) -> Dict: + if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): + min_ret, max_ret = return_reward_range(dataset, max_episode_steps) + dataset["rewards"] /= max_ret - min_ret + dataset["rewards"] *= max_episode_steps + return { + "max_ret": max_ret, + "min_ret": min_ret, + "max_episode_steps": max_episode_steps, + } + elif "antmaze" in env_name: + dataset["rewards"] -= 1.0 + return {} + + +def modify_reward_online(reward: float, env_name: str, **kwargs) -> float: + if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): + reward /= kwargs["max_ret"] - kwargs["min_ret"] + reward *= kwargs["max_episode_steps"] + elif "antmaze" in env_name: + reward -= 1.0 + return reward + + +def asymmetric_l2_loss(u: torch.Tensor, tau: float) -> torch.Tensor: + return torch.mean(torch.abs(tau - (u < 0).float()) * u**2) + + +class Squeeze(nn.Module): + def __init__(self, dim=-1): + super().__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x.squeeze(dim=self.dim) + + +class MLP(nn.Module): + def __init__( + self, + dims, + activation_fn: Callable[[], nn.Module] = nn.ReLU, + output_activation_fn: Callable[[], nn.Module] = None, + squeeze_output: bool = False, + dropout: float = 0.0, + ): + super().__init__() + n_dims = len(dims) + if n_dims < 2: + raise ValueError("MLP requires at least two dims (input and output)") + + layers = [] + for i in range(n_dims - 2): + layers.append(nn.Linear(dims[i], dims[i + 1])) + layers.append(activation_fn()) + if dropout > 0.0: + layers.append(nn.Dropout(dropout)) + layers.append(nn.Linear(dims[-2], dims[-1])) + if output_activation_fn is not None: + layers.append(output_activation_fn()) + if squeeze_output: + if dims[-1] != 1: + raise ValueError("Last dim must be 1 when squeezing") + layers.append(Squeeze(-1)) + self.net = nn.Sequential(*layers) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.net(x) + + +class GaussianPolicy(nn.Module): + def __init__( + self, + state_dim: int, + act_dim: int, + max_action: float, + hidden_dim: int = 256, + n_hidden: int = 2, + dropout: float = 0.0, + ): + super().__init__() + self.net = MLP( + [state_dim, *([hidden_dim] * n_hidden), act_dim], + output_activation_fn=nn.Tanh, + dropout=dropout, + ) + self.log_std = nn.Parameter(torch.zeros(act_dim, dtype=torch.float32)) + self.max_action = max_action + + def forward(self, obs: torch.Tensor) -> Normal: + mean = self.net(obs) + std = torch.exp(self.log_std.clamp(LOG_STD_MIN, LOG_STD_MAX)) + return Normal(mean, std) + + @torch.no_grad() + def act(self, state: np.ndarray, device: str = "cpu"): + state = torch.tensor(state.reshape(1, -1), device=device, dtype=torch.float32) + dist = self(state) + action = dist.mean if not self.training else dist.sample() + action = torch.clamp(self.max_action * action, -self.max_action, self.max_action) + return action.cpu().data.numpy().flatten() + + +class DeterministicPolicy(nn.Module): + def __init__( + self, + state_dim: int, + act_dim: int, + max_action: float, + hidden_dim: int = 256, + n_hidden: int = 2, + dropout: float = 0.0, + ): + super().__init__() + self.net = MLP( + [state_dim, *([hidden_dim] * n_hidden), act_dim], + output_activation_fn=nn.Tanh, + dropout=dropout, + ) + self.max_action = max_action + + def forward(self, obs: torch.Tensor) -> torch.Tensor: + return self.net(obs) + + @torch.no_grad() + def act(self, state: np.ndarray, device: str = "cpu"): + 
state = torch.tensor(state.reshape(1, -1), device=device, dtype=torch.float32) + return ( + torch.clamp(self(state) * self.max_action, -self.max_action, self.max_action) + .cpu() + .data.numpy() + .flatten() + ) + + +class TwinQ(nn.Module): + def __init__( + self, state_dim: int, action_dim: int, hidden_dim: int = 256, n_hidden: int = 2 + ): + super().__init__() + dims = [state_dim + action_dim, *([hidden_dim] * n_hidden), 1] + self.q1 = MLP(dims, squeeze_output=True) + self.q2 = MLP(dims, squeeze_output=True) + + def both( + self, state: torch.Tensor, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + sa = torch.cat([state, action], 1) + return self.q1(sa), self.q2(sa) + + def forward(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor: + return torch.min(*self.both(state, action)) + + +class ValueFunction(nn.Module): + def __init__(self, state_dim: int, hidden_dim: int = 256, n_hidden: int = 2): + super().__init__() + dims = [state_dim, *([hidden_dim] * n_hidden), 1] + self.v = MLP(dims, squeeze_output=True) + + def forward(self, state: torch.Tensor) -> torch.Tensor: + return self.v(state) + + +class ImplicitQLearning: + def __init__( + self, + max_action: float, + actor: nn.Module, + actor_optimizer: torch.optim.Optimizer, + q_network: nn.Module, + q_optimizer: torch.optim.Optimizer, + v_network: nn.Module, + v_optimizer: torch.optim.Optimizer, + iql_tau: float = 0.7, + beta: float = 3.0, + max_steps: int = 1000000, + discount: float = 0.99, + tau: float = 0.005, + device: str = "cpu", + ): + self.max_action = max_action + self.qf = q_network + self.q_target = copy.deepcopy(self.qf).requires_grad_(False).to(device) + self.vf = v_network + self.actor = actor + self.v_optimizer = v_optimizer + self.q_optimizer = q_optimizer + self.actor_optimizer = actor_optimizer + self.actor_lr_schedule = CosineAnnealingLR(self.actor_optimizer, max_steps) + self.iql_tau = iql_tau + self.beta = beta + self.discount = discount + self.tau = tau + + self.total_it = 0 + self.device = device + + def _update_v(self, observations, actions, log_dict) -> torch.Tensor: + # Update value function + with torch.no_grad(): + target_q = self.q_target(observations, actions) + + v = self.vf(observations) + adv = target_q - v + v_loss = asymmetric_l2_loss(adv, self.iql_tau) + log_dict["value_loss"] = v_loss.item() + self.v_optimizer.zero_grad() + v_loss.backward() + self.v_optimizer.step() + return adv + + def _update_q( + self, + next_v: torch.Tensor, + observations: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + terminals: torch.Tensor, + log_dict: Dict, + ): + targets = rewards + (1.0 - terminals.float()) * self.discount * next_v.detach() + qs = self.qf.both(observations, actions) + q_loss = sum(F.mse_loss(q, targets) for q in qs) / len(qs) + log_dict["q_loss"] = q_loss.item() + self.q_optimizer.zero_grad() + q_loss.backward() + self.q_optimizer.step() + + # Update target Q network + soft_update(self.q_target, self.qf, self.tau) + + def _update_policy( + self, + adv: torch.Tensor, + observations: torch.Tensor, + actions: torch.Tensor, + log_dict: Dict, + ): + exp_adv = torch.exp(self.beta * adv.detach()).clamp(max=EXP_ADV_MAX) + policy_out = self.actor(observations) + if isinstance(policy_out, torch.distributions.Distribution): + bc_losses = -policy_out.log_prob(actions).sum(-1, keepdim=False) + elif torch.is_tensor(policy_out): + if policy_out.shape != actions.shape: + raise RuntimeError("Actions shape mismatch") + bc_losses = torch.sum((policy_out - actions) ** 2, dim=1)
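+ # Either branch yields a per-sample behavioral-cloning term; below it is weighted by + # exp(beta * advantage) clamped to EXP_ADV_MAX, i.e. advantage-weighted regression.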
+ else: + raise NotImplementedError + policy_loss = torch.mean(exp_adv * bc_losses) + log_dict["actor_loss"] = policy_loss.item() + self.actor_optimizer.zero_grad() + policy_loss.backward() + self.actor_optimizer.step() + self.actor_lr_schedule.step() + + def train(self, batch: TensorBatch) -> Dict[str, float]: + self.total_it += 1 + ( + observations, + actions, + rewards, + next_observations, + dones, + ) = batch + log_dict = {} + + with torch.no_grad(): + next_v = self.vf(next_observations) + # Update value function + adv = self._update_v(observations, actions, log_dict) + rewards = rewards.squeeze(dim=-1) + dones = dones.squeeze(dim=-1) + # Update Q function + self._update_q(next_v, observations, actions, rewards, dones, log_dict) + # Update actor + self._update_policy(adv, observations, actions, log_dict) + + return log_dict + + def state_dict(self) -> Dict[str, Any]: + return { + "qf": self.qf.state_dict(), + "q_optimizer": self.q_optimizer.state_dict(), + "vf": self.vf.state_dict(), + "v_optimizer": self.v_optimizer.state_dict(), + "actor": self.actor.state_dict(), + "actor_optimizer": self.actor_optimizer.state_dict(), + "actor_lr_schedule": self.actor_lr_schedule.state_dict(), + "total_it": self.total_it, + } + + def load_state_dict(self, state_dict: Dict[str, Any]): + self.qf.load_state_dict(state_dict["qf"]) + self.q_optimizer.load_state_dict(state_dict["q_optimizer"]) + self.q_target = copy.deepcopy(self.qf) + + self.vf.load_state_dict(state_dict["vf"]) + self.v_optimizer.load_state_dict(state_dict["v_optimizer"]) + + self.actor.load_state_dict(state_dict["actor"]) + self.actor_optimizer.load_state_dict(state_dict["actor_optimizer"]) + self.actor_lr_schedule.load_state_dict(state_dict["actor_lr_schedule"]) + + self.total_it = state_dict["total_it"] + + +@pyrallis.wrap() +def train(config: TrainConfig): + env = gym.make(config.env) + eval_env = gym.make(config.env) + + is_env_with_goal = config.env.startswith(ENVS_WITH_GOAL) + + max_steps = env._max_episode_steps + + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + + dataset = d4rl.qlearning_dataset(env) + + reward_mod_dict = {} + if config.normalize_reward: + reward_mod_dict = modify_reward(dataset, config.env) + + if config.normalize: + state_mean, state_std = compute_mean_std(dataset["observations"], eps=1e-3) + else: + state_mean, state_std = 0, 1 + + dataset["observations"] = normalize_states( + dataset["observations"], state_mean, state_std + ) + dataset["next_observations"] = normalize_states( + dataset["next_observations"], state_mean, state_std + ) + env = wrap_env(env, state_mean=state_mean, state_std=state_std) + eval_env = wrap_env(eval_env, state_mean=state_mean, state_std=state_std) + replay_buffer = ReplayBuffer( + state_dim, + action_dim, + config.buffer_size, + config.device, + ) + replay_buffer.load_d4rl_dataset(dataset) + + max_action = float(env.action_space.high[0]) + + if config.checkpoints_path is not None: + print(f"Checkpoints path: {config.checkpoints_path}") + os.makedirs(config.checkpoints_path, exist_ok=True) + with open(os.path.join(config.checkpoints_path, "config.yaml"), "w") as f: + pyrallis.dump(config, f) + + # Set seeds + seed = config.seed + set_seed(seed, env) + set_env_seed(eval_env, config.eval_seed) + + q_network = TwinQ(state_dim, action_dim).to(config.device) + v_network = ValueFunction(state_dim).to(config.device) + actor = ( + DeterministicPolicy( + state_dim, action_dim, max_action, dropout=config.actor_dropout + ) + if config.iql_deterministic + 
else GaussianPolicy( + state_dim, action_dim, max_action, dropout=config.actor_dropout + ) + ).to(config.device) + v_optimizer = torch.optim.Adam(v_network.parameters(), lr=config.vf_lr) + q_optimizer = torch.optim.Adam(q_network.parameters(), lr=config.qf_lr) + actor_optimizer = torch.optim.Adam(actor.parameters(), lr=config.actor_lr) + + kwargs = { + "max_action": max_action, + "actor": actor, + "actor_optimizer": actor_optimizer, + "q_network": q_network, + "q_optimizer": q_optimizer, + "v_network": v_network, + "v_optimizer": v_optimizer, + "discount": config.discount, + "tau": config.tau, + "device": config.device, + # IQL + "beta": config.beta, + "iql_tau": config.iql_tau, + "max_steps": config.offline_iterations, + } + + print("---------------------------------------") + print(f"Training IQL, Env: {config.env}, Seed: {seed}") + print("---------------------------------------") + + # Initialize actor + trainer = ImplicitQLearning(**kwargs) + + if config.load_model != "": + policy_file = Path(config.load_model) + trainer.load_state_dict(torch.load(policy_file)) + actor = trainer.actor + + wandb_init(asdict(config)) + + evaluations = [] + + state, done = env.reset(), False + episode_return = 0 + episode_step = 0 + goal_achieved = False + + eval_successes = [] + train_successes = [] + + print("Offline pretraining") + for t in range(int(config.offline_iterations) + int(config.online_iterations)): + if t == config.offline_iterations: + print("Online tuning") + online_log = {} + if t >= config.offline_iterations: + episode_step += 1 + action = actor( + torch.tensor( + state.reshape(1, -1), device=config.device, dtype=torch.float32 + ) + ) + if not config.iql_deterministic: + action = action.sample() + else: + noise = (torch.randn_like(action) * config.expl_noise).clamp( + -config.noise_clip, config.noise_clip + ) + action += noise + action = torch.clamp(max_action * action, -max_action, max_action) + action = action.cpu().data.numpy().flatten() + next_state, reward, done, env_infos = env.step(action) + + if not goal_achieved: + goal_achieved = is_goal_reached(reward, env_infos) + episode_return += reward + + real_done = False # Episode can timeout which is different from done + if done and episode_step < max_steps: + real_done = True + + if config.normalize_reward: + reward = modify_reward_online(reward, config.env, **reward_mod_dict) + + replay_buffer.add_transition(state, action, reward, next_state, real_done) + state = next_state + if done: + state, done = env.reset(), False + # Valid only for envs with goal, e.g. 
AntMaze, Adroit + if is_env_with_goal: + train_successes.append(goal_achieved) + online_log["train/regret"] = np.mean(1 - np.array(train_successes)) + online_log["train/is_success"] = float(goal_achieved) + online_log["train/episode_return"] = episode_return + normalized_return = eval_env.get_normalized_score(episode_return) + online_log["train/d4rl_normalized_episode_return"] = ( + normalized_return * 100.0 + ) + online_log["train/episode_length"] = episode_step + episode_return = 0 + episode_step = 0 + goal_achieved = False + + batch = replay_buffer.sample(config.batch_size) + batch = [b.to(config.device) for b in batch] + log_dict = trainer.train(batch) + log_dict["offline_iter" if t < config.offline_iterations else "online_iter"] = ( + t if t < config.offline_iterations else t - config.offline_iterations + ) + log_dict.update(online_log) + wandb.log(log_dict, step=trainer.total_it) + # Evaluate episode + if (t + 1) % config.eval_freq == 0: + print(f"Time steps: {t + 1}") + eval_scores, success_rate = eval_actor( + eval_env, + actor, + device=config.device, + n_episodes=config.n_episodes, + seed=config.seed, + ) + eval_score = eval_scores.mean() + eval_log = {} + normalized = eval_env.get_normalized_score(eval_score) + # Valid only for envs with goal, e.g. AntMaze, Adroit + if t >= config.offline_iterations and is_env_with_goal: + eval_successes.append(success_rate) + eval_log["eval/regret"] = np.mean(1 - np.array(train_successes)) + eval_log["eval/success_rate"] = success_rate + normalized_eval_score = normalized * 100.0 + evaluations.append(normalized_eval_score) + eval_log["eval/d4rl_normalized_score"] = normalized_eval_score + print("---------------------------------------") + print( + f"Evaluation over {config.n_episodes} episodes: " + f"{eval_score:.3f} , D4RL score: {normalized_eval_score:.3f}" + ) + print("---------------------------------------") + if config.checkpoints_path is not None: + torch.save( + trainer.state_dict(), + os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"), + ) + wandb.log(eval_log, step=trainer.total_it) + + +if __name__ == "__main__": + train() diff --git a/algorithms/finetune/spot.py b/algorithms/finetune/spot.py new file mode 100644 index 00000000..0cb3c3ea --- /dev/null +++ b/algorithms/finetune/spot.py @@ -0,0 +1,918 @@ +# source: https://github.com/thuml/SPOT/tree/58c591dc48fbd9ff632b7494eab4caf778e86f4a +# https://arxiv.org/pdf/2202.06239.pdf +import copy +import os +import random +import uuid +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +import d4rl +import gym +import numpy as np +import pyrallis +import torch +import torch.distributions as td +import torch.nn as nn +import torch.nn.functional as F +import wandb + +TensorBatch = List[torch.Tensor] +ENVS_WITH_GOAL = ("antmaze", "pen", "door", "hammer", "relocate") + + +@dataclass +class TrainConfig: + # Experiment + device: str = "cuda" + env: str = "antmaze-umaze-v2" # OpenAI gym environment name + seed: int = 0 # Sets Gym, PyTorch and Numpy seeds + eval_seed: int = 0 # Eval environment seed + eval_freq: int = int(5e3) # How often (time steps) we evaluate + n_episodes: int = 10 # How many episodes run during evaluation + offline_iterations: int = int(1e6) # Number of offline updates + online_iterations: int = int(1e6) # Number of online updates + checkpoints_path: Optional[str] = None # Save path + load_model: str = "" # Model load file name, "" doesn't load + # TD3 + actor_lr: float = 1e-4 # Actor 
learning rate + critic_lr: float = 3e-4 # Critic learning rate + buffer_size: int = 20_000_000 # Replay buffer size + batch_size: int = 256 # Batch size for all networks + discount: float = 0.99 # Discount factor + expl_noise: float = 0.1 # Std of Gaussian exploration noise + tau: float = 0.005 # Target network update rate + policy_noise: float = 0.2 # Noise added to target actor during critic update + noise_clip: float = 0.5 # Range to clip target actor noise + policy_freq: int = 2 # Frequency of delayed actor updates + # SPOT VAE + vae_lr: float = 1e-3 # VAE learning rate + vae_hidden_dim: int = 750 # VAE hidden layers dimension + vae_latent_dim: Optional[int] = None # VAE latent space, 2 * action_dim if None + beta: float = 0.5 # KL loss weight + vae_iterations: int = 100_000 # Number of VAE training updates + # SPOT + actor_init_w: Optional[float] = None # Actor head init parameter + critic_init_w: Optional[float] = None # Critic head init parameter + lambd: float = 1.0 # Support constraint weight + num_samples: int = 1 # Number of samples for density estimation + iwae: bool = False # Use IWAE loss + lambd_cool: bool = False # Cooling lambda during fine-tune + lambd_end: float = 0.2 # Minimal value of lambda + normalize: bool = False # Normalize states + normalize_reward: bool = True # Normalize reward + online_discount: float = 0.995 # Discount for online tuning + # Wandb logging + project: str = "CORL" + group: str = "SPOT-D4RL" + name: str = "SPOT" + + def __post_init__(self): + self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}" + if self.checkpoints_path is not None: + self.checkpoints_path = os.path.join(self.checkpoints_path, self.name) + + +def soft_update(target: nn.Module, source: nn.Module, tau: float): + for target_param, source_param in zip(target.parameters(), source.parameters()): + target_param.data.copy_((1 - tau) * target_param.data + tau * source_param.data) + + +def compute_mean_std(states: np.ndarray, eps: float) -> Tuple[np.ndarray, np.ndarray]: + mean = states.mean(0) + std = states.std(0) + eps + return mean, std + + +def normalize_states(states: np.ndarray, mean: np.ndarray, std: np.ndarray): + return (states - mean) / std + + +def wrap_env( + env: gym.Env, + state_mean: Union[np.ndarray, float] = 0.0, + state_std: Union[np.ndarray, float] = 1.0, + reward_scale: float = 1.0, +) -> gym.Env: + # PEP 8: E731 do not assign a lambda expression, use a def + def normalize_state(state): + return ( + state - state_mean + ) / state_std # epsilon should be already added in std. + + def scale_reward(reward): + # Please be careful, here reward is multiplied by scale!
+ return reward_scale * reward + + env = gym.wrappers.TransformObservation(env, normalize_state) + if reward_scale != 1.0: + env = gym.wrappers.TransformReward(env, scale_reward) + return env + + +class ReplayBuffer: + def __init__( + self, + state_dim: int, + action_dim: int, + buffer_size: int, + device: str = "cpu", + ): + self._buffer_size = buffer_size + self._pointer = 0 + self._size = 0 + + self._states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._actions = torch.zeros( + (buffer_size, action_dim), dtype=torch.float32, device=device + ) + self._rewards = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._next_states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._dones = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._device = device + + def _to_tensor(self, data: np.ndarray) -> torch.Tensor: + return torch.tensor(data, dtype=torch.float32, device=self._device) + + # Loads data in d4rl format, i.e. from Dict[str, np.array]. + def load_d4rl_dataset(self, data: Dict[str, np.ndarray]): + if self._size != 0: + raise ValueError("Trying to load data into non-empty replay buffer") + n_transitions = data["observations"].shape[0] + if n_transitions > self._buffer_size: + raise ValueError( + "Replay buffer is smaller than the dataset you are trying to load!" + ) + self._states[:n_transitions] = self._to_tensor(data["observations"]) + self._actions[:n_transitions] = self._to_tensor(data["actions"]) + self._rewards[:n_transitions] = self._to_tensor(data["rewards"][..., None]) + self._next_states[:n_transitions] = self._to_tensor(data["next_observations"]) + self._dones[:n_transitions] = self._to_tensor(data["terminals"][..., None]) + self._size += n_transitions + self._pointer = min(self._size, n_transitions) + + print(f"Dataset size: {n_transitions}") + + def sample(self, batch_size: int) -> TensorBatch: + indices = np.random.randint(0, self._size, size=batch_size) + states = self._states[indices] + actions = self._actions[indices] + rewards = self._rewards[indices] + next_states = self._next_states[indices] + dones = self._dones[indices] + return [states, actions, rewards, next_states, dones] + + def add_transition( + self, + state: np.ndarray, + action: np.ndarray, + reward: float, + next_state: np.ndarray, + done: bool, + ): + # Use this method to add new data into the replay buffer during fine-tuning. 
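+ # Note: the buffer acts as a ring buffer: `_pointer` wraps around `_buffer_size`, + # so if it ever fills up during fine-tuning, the oldest transitions are overwritten first.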
+ self._states[self._pointer] = self._to_tensor(state) + self._actions[self._pointer] = self._to_tensor(action) + self._rewards[self._pointer] = self._to_tensor(reward) + self._next_states[self._pointer] = self._to_tensor(next_state) + self._dones[self._pointer] = self._to_tensor(done) + + self._pointer = (self._pointer + 1) % self._buffer_size + self._size = min(self._size + 1, self._buffer_size) + + +def set_env_seed(env: Optional[gym.Env], seed: int): + env.seed(seed) + env.action_space.seed(seed) + + +def set_seed( + seed: int, env: Optional[gym.Env] = None, deterministic_torch: bool = False +): + if env is not None: + set_env_seed(env, seed) + os.environ["PYTHONHASHSEED"] = str(seed) + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(deterministic_torch) + + +def wandb_init(config: dict) -> None: + wandb.init( + config=config, + project=config["project"], + group=config["group"], + name=config["name"], + id=str(uuid.uuid4()), + ) + wandb.run.save() + + +def is_goal_reached(reward: float, info: Dict) -> bool: + if "goal_achieved" in info: + return info["goal_achieved"] + return reward > 0 # Assuming that reaching target is a positive reward + + +@torch.no_grad() +def eval_actor( + env: gym.Env, actor: nn.Module, device: str, n_episodes: int, seed: int +) -> Tuple[np.ndarray, np.ndarray]: + env.seed(seed) + actor.eval() + episode_rewards = [] + successes = [] + for _ in range(n_episodes): + state, done = env.reset(), False + episode_reward = 0.0 + goal_achieved = False + while not done: + action = actor.act(state, device) + state, reward, done, env_infos = env.step(action) + episode_reward += reward + if not goal_achieved: + goal_achieved = is_goal_reached(reward, env_infos) + # Valid only for environments with goal + successes.append(float(goal_achieved)) + episode_rewards.append(episode_reward) + + actor.train() + return np.asarray(episode_rewards), np.mean(successes) + + +def return_reward_range(dataset: Dict, max_episode_steps: int) -> Tuple[float, float]: + returns, lengths = [], [] + ep_ret, ep_len = 0.0, 0 + for r, d in zip(dataset["rewards"], dataset["terminals"]): + ep_ret += float(r) + ep_len += 1 + if d or ep_len == max_episode_steps: + returns.append(ep_ret) + lengths.append(ep_len) + ep_ret, ep_len = 0.0, 0 + lengths.append(ep_len) # but still keep track of number of steps + assert sum(lengths) == len(dataset["rewards"]) + return min(returns), max(returns) + + +def modify_reward(dataset: Dict, env_name: str, max_episode_steps: int = 1000) -> Dict: + if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): + min_ret, max_ret = return_reward_range(dataset, max_episode_steps) + dataset["rewards"] /= max_ret - min_ret + dataset["rewards"] *= max_episode_steps + return { + "max_ret": max_ret, + "min_ret": min_ret, + "max_episode_steps": max_episode_steps, + } + elif "antmaze" in env_name: + dataset["rewards"] -= 1.0 + return {} + + +def modify_reward_online(reward: float, env_name: str, **kwargs) -> float: + if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): + reward /= kwargs["max_ret"] - kwargs["min_ret"] + reward *= kwargs["max_episode_steps"] + elif "antmaze" in env_name: + reward -= 1.0 + return reward + + +def weights_init(m: nn.Module, init_w: float = 3e-3): + if isinstance(m, nn.Linear): + m.weight.data.uniform_(-init_w, init_w) + m.bias.data.uniform_(-init_w, init_w) + + +class VAE(nn.Module): + # Vanilla Variational Auto-Encoder + + def __init__( + self, + state_dim: int, + 
action_dim: int, + latent_dim: int, + max_action: float, + hidden_dim: int = 750, + ): + super(VAE, self).__init__() + if latent_dim is None: + latent_dim = 2 * action_dim + self.encoder_shared = nn.Sequential( + nn.Linear(state_dim + action_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + ) + + self.mean = nn.Linear(hidden_dim, latent_dim) + self.log_std = nn.Linear(hidden_dim, latent_dim) + + self.decoder = nn.Sequential( + nn.Linear(state_dim + latent_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, action_dim), + nn.Tanh(), + ) + + self.max_action = max_action + self.latent_dim = latent_dim + + def forward( + self, + state: torch.Tensor, + action: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + mean, std = self.encode(state, action) + z = mean + std * torch.randn_like(std) + u = self.decode(state, z) + return u, mean, std + + def importance_sampling_estimator( + self, + state: torch.Tensor, + action: torch.Tensor, + beta: float, + num_samples: int = 500, + ) -> torch.Tensor: + # * num_samples corresponds to the number of samples L in the paper + # * note that for the exact value of \hat \log \pi_\beta in the paper + # we also need **an expectation over L samples** + mean, std = self.encode(state, action) + + mean_enc = mean.repeat(num_samples, 1, 1).permute(1, 0, 2) # [B x S x D] + std_enc = std.repeat(num_samples, 1, 1).permute(1, 0, 2) # [B x S x D] + z = mean_enc + std_enc * torch.randn_like(std_enc) # [B x S x D] + + state = state.repeat(num_samples, 1, 1).permute(1, 0, 2) # [B x S x C] + action = action.repeat(num_samples, 1, 1).permute(1, 0, 2) # [B x S x C] + mean_dec = self.decode(state, z) + std_dec = np.sqrt(beta / 4) + + # Find q(z|x) + log_qzx = td.Normal(loc=mean_enc, scale=std_enc).log_prob(z) + # Find p(z) + mu_prior = torch.zeros_like(z).to(self.device) + std_prior = torch.ones_like(z).to(self.device) + log_pz = td.Normal(loc=mu_prior, scale=std_prior).log_prob(z) + # Find p(x|z) + std_dec = torch.ones_like(mean_dec).to(self.device) * std_dec + log_pxz = td.Normal(loc=mean_dec, scale=std_dec).log_prob(action) + + w = log_pxz.sum(-1) + log_pz.sum(-1) - log_qzx.sum(-1) + ll = w.logsumexp(dim=-1) - np.log(num_samples) + return ll + + def encode( + self, + state: torch.Tensor, + action: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + z = self.encoder_shared(torch.cat([state, action], -1)) + + mean = self.mean(z) + # Clamped for numerical stability + log_std = self.log_std(z).clamp(-4, 15) + std = torch.exp(log_std) + return mean, std + + def decode( + self, + state: torch.Tensor, + z: torch.Tensor = None, + ) -> torch.Tensor: + # When sampling from the VAE, the latent vector is clipped to [-0.5, 0.5] + if z is None: + z = ( + torch.randn((state.shape[0], self.latent_dim)) + .to(self.device) + .clamp(-0.5, 0.5) + ) + x = torch.cat([state, z], -1) + return self.max_action * self.decoder(x) + + +class Actor(nn.Module): + def __init__( + self, + state_dim: int, + action_dim: int, + max_action: float, + init_w: Optional[float] = None, + ): + super(Actor, self).__init__() + + head = nn.Linear(256, action_dim) + if init_w is not None: + weights_init(head, init_w) + + self.net = nn.Sequential( + nn.Linear(state_dim, 256), + nn.ReLU(), + nn.Linear(256, 256), + nn.ReLU(), + head, + nn.Tanh(), + ) + + self.max_action = max_action + + def forward(self, state: torch.Tensor) -> torch.Tensor: + return self.max_action * self.net(state) + + @torch.no_grad() + def act(self, state:
np.ndarray, device: str = "cpu") -> np.ndarray: + state = torch.tensor(state.reshape(1, -1), device=device, dtype=torch.float32) + return self(state).cpu().data.numpy().flatten() + + +class Critic(nn.Module): + def __init__(self, state_dim: int, action_dim: int, init_w: Optional[float] = None): + super(Critic, self).__init__() + + head = nn.Linear(256, 1) + if init_w is not None: + weights_init(head, init_w) + + self.net = nn.Sequential( + nn.Linear(state_dim + action_dim, 256), + nn.ReLU(), + nn.Linear(256, 256), + nn.ReLU(), + head, + ) + + def forward(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor: + sa = torch.cat([state, action], 1) + return self.net(sa) + + +class SPOT: + def __init__( + self, + max_action: float, + actor: nn.Module, + actor_optimizer: torch.optim.Optimizer, + critic_1: nn.Module, + critic_1_optimizer: torch.optim.Optimizer, + critic_2: nn.Module, + critic_2_optimizer: torch.optim.Optimizer, + vae: nn.Module, + vae_optimizer: torch.optim.Optimizer, + discount: float = 0.99, + tau: float = 0.005, + policy_noise: float = 0.2, + noise_clip: float = 0.5, + policy_freq: int = 2, + beta: float = 0.5, + lambd: float = 1.0, + num_samples: int = 1, + iwae: bool = False, + lambd_cool: bool = False, + lambd_end: float = 0.2, + max_online_steps: int = 1_000_000, + device: str = "cpu", + ): + self.actor = actor + self.actor_target = copy.deepcopy(actor) + self.actor_optimizer = actor_optimizer + self.critic_1 = critic_1 + self.critic_1_target = copy.deepcopy(critic_1) + self.critic_1_optimizer = critic_1_optimizer + self.critic_2 = critic_2 + self.critic_2_target = copy.deepcopy(critic_2) + self.critic_2_optimizer = critic_2_optimizer + + self.vae = vae + self.vae_optimizer = vae_optimizer + + self.max_action = max_action + self.discount = discount + self.tau = tau + self.policy_noise = policy_noise + self.noise_clip = noise_clip + self.policy_freq = policy_freq + + self.beta = beta + self.lambd = lambd + self.num_samples = num_samples + self.iwae = iwae + self.lambd_cool = lambd_cool + self.lambd_end = lambd_end + self.max_online_steps = max_online_steps + + self.is_online = False + self.online_it = 0 + + self.total_it = 0 + + self.device = device + + def elbo_loss( + self, + state: torch.Tensor, + action: torch.Tensor, + beta: float, + num_samples: int = 1, + ) -> torch.Tensor: + """ + Note: elbo_loss one is proportional to elbo_estimator + i.e. 
there exist a>0 and b, elbo_loss = a * (-elbo_estimator) + b + """ + mean, std = self.vae.encode(state, action) + + mean_s = mean.repeat(num_samples, 1, 1).permute(1, 0, 2) # [B x S x D] + std_s = std.repeat(num_samples, 1, 1).permute(1, 0, 2) # [B x S x D] + z = mean_s + std_s * torch.randn_like(std_s) + + state = state.repeat(num_samples, 1, 1).permute(1, 0, 2) # [B x S x C] + action = action.repeat(num_samples, 1, 1).permute(1, 0, 2) # [B x S x C] + u = self.vae.decode(state, z) + recon_loss = ((u - action) ** 2).mean(dim=(1, 2)) + + KL_loss = -0.5 * (1 + torch.log(std.pow(2)) - mean.pow(2) - std.pow(2)).mean(-1) + vae_loss = recon_loss + beta * KL_loss + return vae_loss + + def iwae_loss( + self, + state: torch.Tensor, + action: torch.Tensor, + beta: float, + num_samples: int = 10, + ) -> torch.Tensor: + ll = self.vae.importance_sampling_estimator(state, action, beta, num_samples) + return -ll + + def vae_train(self, batch: TensorBatch) -> Dict[str, float]: + log_dict = {} + self.total_it += 1 + + state, action, _, _, _ = batch + # Variational Auto-Encoder Training + recon, mean, std = self.vae(state, action) + recon_loss = F.mse_loss(recon, action) + KL_loss = -0.5 * (1 + torch.log(std.pow(2)) - mean.pow(2) - std.pow(2)).mean() + vae_loss = recon_loss + self.beta * KL_loss + + self.vae_optimizer.zero_grad() + vae_loss.backward() + self.vae_optimizer.step() + + log_dict["VAE/reconstruction_loss"] = recon_loss.item() + log_dict["VAE/KL_loss"] = KL_loss.item() + log_dict["VAE/vae_loss"] = vae_loss.item() + + return log_dict + + def train(self, batch: TensorBatch) -> Dict[str, float]: + log_dict = {} + self.total_it += 1 + if self.is_online: + self.online_it += 1 + + state, action, reward, next_state, done = batch + not_done = 1 - done + + with torch.no_grad(): + # Select action according to actor and add clipped noise + noise = (torch.randn_like(action) * self.policy_noise).clamp( + -self.noise_clip, self.noise_clip + ) + + next_action = (self.actor_target(next_state) + noise).clamp( + -self.max_action, self.max_action + ) + + # Compute the target Q value + target_q1 = self.critic_1_target(next_state, next_action) + target_q2 = self.critic_2_target(next_state, next_action) + target_q = torch.min(target_q1, target_q2) + target_q = reward + not_done * self.discount * target_q + + # Get current Q estimates + current_q1 = self.critic_1(state, action) + current_q2 = self.critic_2(state, action) + + # Compute critic loss + critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q) + log_dict["critic_loss"] = critic_loss.item() + # Optimize the critic + self.critic_1_optimizer.zero_grad() + self.critic_2_optimizer.zero_grad() + critic_loss.backward() + self.critic_1_optimizer.step() + self.critic_2_optimizer.step() + + # Delayed actor updates + if self.total_it % self.policy_freq == 0: + # Compute actor loss + pi = self.actor(state) + q = self.critic_1(state, pi) + + if self.iwae: + neg_log_beta = self.iwae_loss(state, pi, self.beta, self.num_samples) + else: + neg_log_beta = self.elbo_loss(state, pi, self.beta, self.num_samples) + + if self.lambd_cool: + lambd = self.lambd * max( + self.lambd_end, (1.0 - self.online_it / self.max_online_steps) + ) + else: + lambd = self.lambd + + norm_q = 1 / q.abs().mean().detach() + + actor_loss = -norm_q * q.mean() + lambd * neg_log_beta.mean() + + log_dict["actor_loss"] = actor_loss.item() + log_dict["neg_log_beta_mean"] = neg_log_beta.mean().item() + log_dict["neg_log_beta_max"] = neg_log_beta.max().item() + log_dict["lambd"] = 
lambd + + # Optimize the actor + self.actor_optimizer.zero_grad() + actor_loss.backward() + self.actor_optimizer.step() + + # Update the frozen target models + soft_update(self.critic_1_target, self.critic_1, self.tau) + soft_update(self.critic_2_target, self.critic_2, self.tau) + soft_update(self.actor_target, self.actor, self.tau) + + return log_dict + + def state_dict(self) -> Dict[str, Any]: + return { + "vae": self.vae.state_dict(), + "vae_optimizer": self.vae_optimizer.state_dict(), + "critic_1": self.critic_1.state_dict(), + "critic_1_optimizer": self.critic_1_optimizer.state_dict(), + "critic_2": self.critic_2.state_dict(), + "critic_2_optimizer": self.critic_2_optimizer.state_dict(), + "actor": self.actor.state_dict(), + "actor_optimizer": self.actor_optimizer.state_dict(), + "total_it": self.total_it, + } + + def load_state_dict(self, state_dict: Dict[str, Any]): + self.vae.load_state_dict(state_dict["vae"]) + self.vae_optimizer.load_state_dict(state_dict["vae_optimizer"]) + + self.critic_1.load_state_dict(state_dict["critic_1"]) + self.critic_1_optimizer.load_state_dict(state_dict["critic_1_optimizer"]) + self.critic_1_target = copy.deepcopy(self.critic_1) + + self.critic_2.load_state_dict(state_dict["critic_2"]) + self.critic_2_optimizer.load_state_dict(state_dict["critic_2_optimizer"]) + self.critic_2_target = copy.deepcopy(self.critic_2) + + self.actor.load_state_dict(state_dict["actor"]) + self.actor_optimizer.load_state_dict(state_dict["actor_optimizer"]) + self.actor_target = copy.deepcopy(self.actor) + + self.total_it = state_dict["total_it"] + + +@pyrallis.wrap() +def train(config: TrainConfig): + env = gym.make(config.env) + eval_env = gym.make(config.env) + + is_env_with_goal = config.env.startswith(ENVS_WITH_GOAL) + + max_steps = env._max_episode_steps + + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + + dataset = d4rl.qlearning_dataset(env) + + reward_mod_dict = {} + if config.normalize_reward: + reward_mod_dict = modify_reward(dataset, config.env) + + if config.normalize: + state_mean, state_std = compute_mean_std(dataset["observations"], eps=1e-3) + else: + state_mean, state_std = 0, 1 + + dataset["observations"] = normalize_states( + dataset["observations"], state_mean, state_std + ) + dataset["next_observations"] = normalize_states( + dataset["next_observations"], state_mean, state_std + ) + env = wrap_env(env, state_mean=state_mean, state_std=state_std) + eval_env = wrap_env(eval_env, state_mean=state_mean, state_std=state_std) + replay_buffer = ReplayBuffer( + state_dim, + action_dim, + config.buffer_size, + config.device, + ) + replay_buffer.load_d4rl_dataset(dataset) + + max_action = float(env.action_space.high[0]) + + if config.checkpoints_path is not None: + print(f"Checkpoints path: {config.checkpoints_path}") + os.makedirs(config.checkpoints_path, exist_ok=True) + with open(os.path.join(config.checkpoints_path, "config.yaml"), "w") as f: + pyrallis.dump(config, f) + + # Set seeds + seed = config.seed + set_seed(seed, env) + set_env_seed(eval_env, config.eval_seed) + + vae = VAE( + state_dim, action_dim, config.vae_latent_dim, max_action, config.vae_hidden_dim + ).to(config.device) + vae_optimizer = torch.optim.Adam(vae.parameters(), lr=config.vae_lr) + + actor = Actor(state_dim, action_dim, max_action, config.actor_init_w).to( + config.device + ) + actor_optimizer = torch.optim.Adam(actor.parameters(), lr=config.actor_lr) + + critic_1 = Critic(state_dim, action_dim, config.critic_init_w).to(config.device) + 
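+ # Note: SPOT keeps the TD3 backbone: two independent critics with separate optimizers, + # combined via clipped double-Q targets (torch.min of the target critics) in SPOT.train().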
critic_1_optimizer = torch.optim.Adam(critic_1.parameters(), lr=config.critic_lr) + critic_2 = Critic(state_dim, action_dim, config.critic_init_w).to(config.device) + critic_2_optimizer = torch.optim.Adam(critic_2.parameters(), lr=config.critic_lr) + + kwargs = { + "max_action": max_action, + "vae": vae, + "vae_optimizer": vae_optimizer, + "actor": actor, + "actor_optimizer": actor_optimizer, + "critic_1": critic_1, + "critic_1_optimizer": critic_1_optimizer, + "critic_2": critic_2, + "critic_2_optimizer": critic_2_optimizer, + "discount": config.discount, + "tau": config.tau, + "device": config.device, + # TD3 + "policy_noise": config.policy_noise * max_action, + "noise_clip": config.noise_clip * max_action, + "policy_freq": config.policy_freq, + # SPOT + "beta": config.beta, + "lambd": config.lambd, + "num_samples": config.num_samples, + "iwae": config.iwae, + "lambd_cool": config.lambd_cool, + "lambd_end": config.lambd_end, + "max_online_steps": config.online_iterations, + } + + print("---------------------------------------") + print(f"Training SPOT, Env: {config.env}, Seed: {seed}") + print("---------------------------------------") + + # Initialize actor + trainer = SPOT(**kwargs) + + if config.load_model != "": + policy_file = Path(config.load_model) + trainer.load_state_dict(torch.load(policy_file)) + actor = trainer.actor + + wandb_init(asdict(config)) + evaluations = [] + + print("Training VAE") + for t in range(int(config.vae_iterations)): + batch = replay_buffer.sample(config.batch_size) + batch = [b.to(config.device) for b in batch] + log_dict = trainer.vae_train(batch) + log_dict["vae_iter"] = t + wandb.log(log_dict, step=trainer.total_it) + + vae.eval() + state, done = env.reset(), False + episode_return = 0 + episode_step = 0 + goal_achieved = False + + eval_successes = [] + train_successes = [] + + print("Offline pretraining") + for t in range(int(config.offline_iterations) + int(config.online_iterations)): + if t == config.offline_iterations: + print("Online tuning") + trainer.is_online = True + trainer.discount = config.online_discount + # Resetting optimizers + trainer.actor_optimizer = torch.optim.Adam( + actor.parameters(), lr=config.actor_lr + ) + trainer.critic_1_optimizer = torch.optim.Adam( + critic_1.parameters(), lr=config.critic_lr + ) + trainer.critic_2_optimizer = torch.optim.Adam( + critic_2.parameters(), lr=config.critic_lr + ) + online_log = {} + if t >= config.offline_iterations: + episode_step += 1 + action = actor( + torch.tensor( + state.reshape(1, -1), device=config.device, dtype=torch.float32 + ) + ) + noise = (torch.randn_like(action) * config.expl_noise).clamp( + -config.noise_clip, config.noise_clip + ) + action += noise + action = torch.clamp(max_action * action, -max_action, max_action) + action = action.cpu().data.numpy().flatten() + next_state, reward, done, env_infos = env.step(action) + + if not goal_achieved: + goal_achieved = is_goal_reached(reward, env_infos) + episode_return += reward + real_done = False # Episode can timeout which is different from done + if done and episode_step < max_steps: + real_done = True + + if config.normalize_reward: + reward = modify_reward_online(reward, config.env, **reward_mod_dict) + + replay_buffer.add_transition(state, action, reward, next_state, real_done) + state = next_state + if done: + state, done = env.reset(), False + # Valid only for envs with goal, e.g. 
AntMaze, Adroit + if is_env_with_goal: + train_successes.append(goal_achieved) + online_log["train/regret"] = np.mean(1 - np.array(train_successes)) + online_log["train/is_success"] = float(goal_achieved) + online_log["train/episode_return"] = episode_return + normalized_return = eval_env.get_normalized_score(episode_return) + online_log["train/d4rl_normalized_episode_return"] = ( + normalized_return * 100.0 + ) + online_log["train/episode_length"] = episode_step + episode_return = 0 + episode_step = 0 + goal_achieved = False + + batch = replay_buffer.sample(config.batch_size) + batch = [b.to(config.device) for b in batch] + log_dict = trainer.train(batch) + log_dict["offline_iter" if t < config.offline_iterations else "online_iter"] = ( + t if t < config.offline_iterations else t - config.offline_iterations + ) + log_dict.update(online_log) + wandb.log(log_dict, step=trainer.total_it) + # Evaluate episode + if (t + 1) % config.eval_freq == 0: + print(f"Time steps: {t + 1}") + eval_scores, success_rate = eval_actor( + eval_env, + actor, + device=config.device, + n_episodes=config.n_episodes, + seed=config.seed, + ) + eval_score = eval_scores.mean() + eval_log = {} + normalized = eval_env.get_normalized_score(np.mean(eval_scores)) + # Valid only for envs with goal, e.g. AntMaze, Adroit + if t >= config.offline_iterations and is_env_with_goal: + eval_successes.append(success_rate) + eval_log["eval/regret"] = np.mean(1 - np.array(train_successes)) + eval_log["eval/success_rate"] = success_rate + normalized_eval_score = normalized * 100.0 + eval_log["eval/d4rl_normalized_score"] = normalized_eval_score + evaluations.append(normalized_eval_score) + print("---------------------------------------") + print( + f"Evaluation over {config.n_episodes} episodes: " + f"{eval_score:.3f} , D4RL score: {normalized_eval_score:.3f}" + ) + print("---------------------------------------") + if config.checkpoints_path is not None: + torch.save( + trainer.state_dict(), + os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"), + ) + wandb.log(eval_log, step=trainer.total_it) + + +if __name__ == "__main__": + train() diff --git a/algorithms/offline/any_percent_bc.py b/algorithms/offline/any_percent_bc.py index f0ef686b..edacc43e 100644 --- a/algorithms/offline/any_percent_bc.py +++ b/algorithms/offline/any_percent_bc.py @@ -1,9 +1,9 @@ -from typing import Any, Dict, List, Optional, Tuple, Union -from dataclasses import asdict, dataclass import os -from pathlib import Path import random import uuid +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union import d4rl import gym @@ -212,13 +212,12 @@ def keep_best_trajectories( reward_scale = 1.0 sort_ord = np.argsort(returns, axis=0)[::-1].reshape(-1) - top_trajs = sort_ord[: int(frac * len(sort_ord))] + top_trajs = sort_ord[: max(1, int(frac * len(sort_ord)))] order = [] for i in top_trajs: order += ids_by_trajectories[i] order = np.array(order) - dataset["observations"] = dataset["observations"][order] dataset["actions"] = dataset["actions"][order] dataset["next_observations"] = dataset["next_observations"][order] @@ -250,7 +249,7 @@ def act(self, state: np.ndarray, device: str = "cpu") -> np.ndarray: return self(state).cpu().data.numpy().flatten() -class BC: # noqa +class BC: def __init__( self, max_action: np.ndarray, @@ -390,10 +389,13 @@ def train(config: TrainConfig): f"{eval_score:.3f} , D4RL score: {normalized_eval_score:.3f}" ) print("---------------------------------------") - 
torch.save( - trainer.state_dict(), - os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"), - ) + + if config.checkpoints_path is not None: + torch.save( + trainer.state_dict(), + os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"), + ) + wandb.log( {"d4rl_normalized_score": normalized_eval_score}, step=trainer.total_it, diff --git a/algorithms/offline/awac.py b/algorithms/offline/awac.py index e5feadb1..2c652de9 100644 --- a/algorithms/offline/awac.py +++ b/algorithms/offline/awac.py @@ -1,9 +1,9 @@ -from typing import Any, Dict, List, Optional, Tuple, Union -from copy import deepcopy -from dataclasses import asdict, dataclass import os import random import uuid +from copy import deepcopy +from dataclasses import asdict, dataclass +from typing import Any, Dict, List, Optional, Tuple, Union import d4rl import gym @@ -12,8 +12,8 @@ import torch import torch.nn as nn import torch.nn.functional -from tqdm import trange import wandb +from tqdm import trange TensorBatch = List[torch.Tensor] @@ -468,7 +468,7 @@ def train(config: TrainConfig): if hasattr(env, "get_normalized_score"): normalized_eval_scores = env.get_normalized_score(eval_scores) * 100.0 wandb.log( - {"normalized_eval_score": normalized_eval_scores.mean()}, step=t + {"d4rl_normalized_score": normalized_eval_scores.mean()}, step=t ) if config.checkpoints_path is not None: diff --git a/algorithms/offline/cql.py b/algorithms/offline/cql.py index 75cf7c01..a1470eb0 100644 --- a/algorithms/offline/cql.py +++ b/algorithms/offline/cql.py @@ -1,23 +1,22 @@ # source: https://github.com/young-geng/CQL/tree/934b0e8354ca431d6c083c4e3a29df88d4b0a24d -# STRONG UNDER-PERFORMANCE ON PART OF ANTMAZE TASKS. BUT IN IQL PAPER IT WORKS SOMEHOW # https://arxiv.org/pdf/2006.04779.pdf -from typing import Any, Dict, List, Optional, Tuple, Union -from copy import deepcopy -from dataclasses import asdict, dataclass import os -from pathlib import Path import random import uuid +from copy import deepcopy +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union import d4rl import gym import numpy as np import pyrallis import torch -from torch.distributions import Normal, TanhTransform, TransformedDistribution import torch.nn as nn import torch.nn.functional as F import wandb +from torch.distributions import Normal, TanhTransform, TransformedDistribution TensorBatch = List[torch.Tensor] @@ -33,6 +32,7 @@ class TrainConfig: max_timesteps: int = int(1e6) # Max time steps to run environment checkpoints_path: Optional[str] = None # Save path load_model: str = "" # Model load file name, "" doesn't load + # CQL buffer_size: int = 2_000_000 # Replay buffer size batch_size: int = 256 # Batch size for all networks @@ -43,20 +43,29 @@ class TrainConfig: policy_lr: float = 3e-5 # Policy learning rate qf_lr: float = 3e-4 # Critics learning rate soft_target_update_rate: float = 5e-3 # Target network update rate - bc_steps: int = int(0) # Number of BC steps at start target_update_period: int = 1 # Frequency of target nets updates cql_n_actions: int = 10 # Number of sampled actions cql_importance_sample: bool = True # Use importance sampling cql_lagrange: bool = False # Use Lagrange version of CQL cql_target_action_gap: float = -1.0 # Action gap cql_temp: float = 1.0 # CQL temperature - cql_min_q_weight: float = 10.0 # Minimal Q weight + cql_alpha: float = 10.0 # Minimal Q weight cql_max_target_backup: bool = False # Use max target backup cql_clip_diff_min: float = -np.inf # Q-function 
lower loss clipping cql_clip_diff_max: float = np.inf # Q-function upper loss clipping orthogonal_init: bool = True # Orthogonal initialization normalize: bool = True # Normalize states normalize_reward: bool = False # Normalize reward + q_n_hidden_layers: int = 3 # Number of hidden layers in Q networks + reward_scale: float = 1.0 # Reward scale for normalization + reward_bias: float = 0.0 # Reward bias for normalization + + # AntMaze hacks + bc_steps: int = int(0) # Number of BC steps at start + reward_scale: float = 5.0 + reward_bias: float = -1.0 + policy_log_std_multiplier: float = 1.0 + # Wandb logging project: str = "CORL" group: str = "CQL-D4RL" @@ -211,7 +220,7 @@ def eval_actor( return np.asarray(episode_rewards) -def return_reward_range(dataset, max_episode_steps): +def return_reward_range(dataset: Dict, max_episode_steps: int) -> Tuple[float, float]: returns, lengths = [], [] ep_ret, ep_len = 0.0, 0 for r, d in zip(dataset["rewards"], dataset["terminals"]): @@ -226,26 +235,40 @@ def return_reward_range(dataset, max_episode_steps): return min(returns), max(returns) -def modify_reward(dataset, env_name, max_episode_steps=1000): +def modify_reward( + dataset: Dict, + env_name: str, + max_episode_steps: int = 1000, + reward_scale: float = 1.0, + reward_bias: float = 0.0, +): if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): min_ret, max_ret = return_reward_range(dataset, max_episode_steps) dataset["rewards"] /= max_ret - min_ret dataset["rewards"] *= max_episode_steps - elif "antmaze" in env_name: - dataset["rewards"] -= 1.0 + dataset["rewards"] = dataset["rewards"] * reward_scale + reward_bias def extend_and_repeat(tensor: torch.Tensor, dim: int, repeat: int) -> torch.Tensor: return tensor.unsqueeze(dim).repeat_interleave(repeat, dim=dim) -def init_module_weights(module: torch.nn.Module, orthogonal_init: bool = False): - if isinstance(module, nn.Linear): - if orthogonal_init: - nn.init.orthogonal_(module.weight, gain=np.sqrt(2)) - nn.init.constant_(module.bias, 0.0) - else: - nn.init.xavier_uniform_(module.weight, gain=1e-2) +def init_module_weights(module: torch.nn.Sequential, orthogonal_init: bool = False): + # Specific orthgonal initialization for inner layers + # If orthogonal init is off, we do not change default initialization + if orthogonal_init: + for submodule in module[:-1]: + if isinstance(submodule, nn.Linear): + nn.init.orthogonal_(submodule.weight, gain=np.sqrt(2)) + nn.init.constant_(submodule.bias, 0.0) + + # Lasy layers should be initialzied differently as well + if orthogonal_init: + nn.init.orthogonal_(module[-1].weight, gain=1e-2) + else: + nn.init.xavier_uniform_(module[-1].weight, gain=1e-2) + + nn.init.constant_(module[-1].bias, 0.0) class ReparameterizedTanhGaussian(nn.Module): @@ -321,10 +344,7 @@ def __init__( nn.Linear(256, 2 * action_dim), ) - if orthogonal_init: - self.base_network.apply(lambda m: init_module_weights(m, True)) - else: - init_module_weights(self.base_network[-1], False) + init_module_weights(self.base_network) self.log_std_multiplier = Scalar(log_std_multiplier) self.log_std_offset = Scalar(log_std_offset) @@ -338,7 +358,8 @@ def log_prob( base_network_output = self.base_network(observations) mean, log_std = torch.split(base_network_output, self.action_dim, dim=-1) log_std = self.log_std_multiplier() * log_std + self.log_std_offset() - return self.tanh_gaussian.log_prob(mean, log_std, actions) + _, log_probs = self.tanh_gaussian(mean, log_std, False) + return log_probs def forward( self, @@ -368,25 +389,25 @@ def 
__init__( observation_dim: int, action_dim: int, orthogonal_init: bool = False, + n_hidden_layers: int = 3, ): super().__init__() self.observation_dim = observation_dim self.action_dim = action_dim self.orthogonal_init = orthogonal_init - self.network = nn.Sequential( + layers = [ nn.Linear(observation_dim + action_dim, 256), nn.ReLU(), - nn.Linear(256, 256), - nn.ReLU(), - nn.Linear(256, 256), - nn.ReLU(), - nn.Linear(256, 1), - ) - if orthogonal_init: - self.network.apply(lambda m: init_module_weights(m, True)) - else: - init_module_weights(self.network[-1], False) + ] + for _ in range(n_hidden_layers - 1): + layers.append(nn.Linear(256, 256)) + layers.append(nn.ReLU()) + layers.append(nn.Linear(256, 1)) + + self.network = nn.Sequential(*layers) + + init_module_weights(self.network, orthogonal_init) def forward(self, observations: torch.Tensor, actions: torch.Tensor) -> torch.Tensor: multiple_actions = False @@ -437,7 +458,7 @@ def __init__( cql_lagrange: bool = False, cql_target_action_gap: float = -1.0, cql_temp: float = 1.0, - cql_min_q_weight: float = 5.0, + cql_alpha: float = 5.0, cql_max_target_backup: bool = False, cql_clip_diff_min: float = -np.inf, cql_clip_diff_max: float = np.inf, @@ -460,7 +481,7 @@ def __init__( self.cql_lagrange = cql_lagrange self.cql_target_action_gap = cql_target_action_gap self.cql_temp = cql_temp - self.cql_min_q_weight = cql_min_q_weight + self.cql_alpha = cql_alpha self.cql_max_target_backup = cql_max_target_backup self.cql_clip_diff_min = cql_clip_diff_min self.cql_clip_diff_max = cql_clip_diff_max @@ -532,8 +553,15 @@ def _policy_loss( return policy_loss def _q_loss( - self, observations, actions, next_observations, rewards, dones, alpha, log_dict - ): + self, + observations: torch.Tensor, + actions: torch.Tensor, + next_observations: torch.Tensor, + rewards: torch.Tensor, + dones: torch.Tensor, + alpha: torch.Tensor, + log_dict: Dict, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: q1_predicted = self.critic_1(observations, actions) q2_predicted = self.critic_2(observations, actions) @@ -562,7 +590,7 @@ def _q_loss( target_q_values = target_q_values - alpha * next_log_pi target_q_values = target_q_values.unsqueeze(-1) - td_target = rewards + (1.0 - dones) * self.discount * target_q_values + td_target = rewards + (1.0 - dones) * self.discount * target_q_values.detach() td_target = td_target.squeeze(-1) qf1_loss = F.mse_loss(q1_predicted, td_target.detach()) qf2_loss = F.mse_loss(q2_predicted, td_target.detach()) @@ -655,14 +683,14 @@ def _q_loss( torch.exp(self.log_alpha_prime()), min=0.0, max=1000000.0 ) cql_min_qf1_loss = ( - alpha_prime # noqa - * self.cql_min_q_weight # noqa - * (cql_qf1_diff - self.cql_target_action_gap) # noqa + alpha_prime + * self.cql_alpha + * (cql_qf1_diff - self.cql_target_action_gap) ) cql_min_qf2_loss = ( - alpha_prime # noqa - * self.cql_min_q_weight # noqa - * (cql_qf2_diff - self.cql_target_action_gap) # noqa + alpha_prime + * self.cql_alpha + * (cql_qf2_diff - self.cql_target_action_gap) ) self.alpha_prime_optimizer.zero_grad() @@ -670,8 +698,8 @@ def _q_loss( alpha_prime_loss.backward(retain_graph=True) self.alpha_prime_optimizer.step() else: - cql_min_qf1_loss = cql_qf1_diff * self.cql_min_q_weight - cql_min_qf2_loss = cql_qf2_diff * self.cql_min_q_weight + cql_min_qf1_loss = cql_qf1_diff * self.cql_alpha + cql_min_qf2_loss = cql_qf2_diff * self.cql_alpha alpha_prime_loss = observations.new_tensor(0.0) alpha_prime = observations.new_tensor(0.0) @@ -815,7 +843,12 @@ def train(config: TrainConfig): dataset 
= d4rl.qlearning_dataset(env) if config.normalize_reward: - modify_reward(dataset, config.env) + modify_reward( + dataset, + config.env, + reward_scale=config.reward_scale, + reward_bias=config.reward_bias, + ) if config.normalize: state_mean, state_std = compute_mean_std(dataset["observations"], eps=1e-3) @@ -849,9 +882,12 @@ def train(config: TrainConfig): seed = config.seed set_seed(seed, env) - critic_1 = FullyConnectedQFunction(state_dim, action_dim, config.orthogonal_init).to( - config.device - ) + critic_1 = FullyConnectedQFunction( + state_dim, + action_dim, + config.orthogonal_init, + config.q_n_hidden_layers, + ).to(config.device) critic_2 = FullyConnectedQFunction(state_dim, action_dim, config.orthogonal_init).to( config.device ) @@ -859,7 +895,11 @@ def train(config: TrainConfig): critic_2_optimizer = torch.optim.Adam(list(critic_2.parameters()), config.qf_lr) actor = TanhGaussianPolicy( - state_dim, action_dim, max_action, orthogonal_init=config.orthogonal_init + state_dim, + action_dim, + max_action, + log_std_multiplier=config.policy_log_std_multiplier, + orthogonal_init=config.orthogonal_init, ).to(config.device) actor_optimizer = torch.optim.Adam(actor.parameters(), config.policy_lr) @@ -887,7 +927,7 @@ def train(config: TrainConfig): "cql_lagrange": config.cql_lagrange, "cql_target_action_gap": config.cql_target_action_gap, "cql_temp": config.cql_temp, - "cql_min_q_weight": config.cql_min_q_weight, + "cql_alpha": config.cql_alpha, "cql_max_target_backup": config.cql_max_target_backup, "cql_clip_diff_min": config.cql_clip_diff_min, "cql_clip_diff_max": config.cql_clip_diff_max, diff --git a/algorithms/offline/dt.py b/algorithms/offline/dt.py index d5c2e6e7..37c61e67 100644 --- a/algorithms/offline/dt.py +++ b/algorithms/offline/dt.py @@ -1,24 +1,23 @@ # inspiration: # 1. https://github.com/kzl/decision-transformer/blob/master/gym/decision_transformer/models/decision_transformer.py # noqa # 2. https://github.com/karpathy/minGPT -from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union -from collections import defaultdict -from dataclasses import asdict, dataclass import os import random import uuid +from collections import defaultdict +from dataclasses import asdict, dataclass +from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union import d4rl # noqa -import gym # noqa +import gym import numpy as np import pyrallis import torch import torch.nn as nn -from torch.nn import functional as F # noqa +import wandb +from torch.nn import functional as F from torch.utils.data import DataLoader, IterableDataset from tqdm.auto import tqdm, trange # noqa -import wandb - @dataclass class TrainConfig: @@ -381,10 +380,10 @@ def eval_rollout( # step + 1 as : operator is not inclusive, last action is dummy with zeros # (as model will predict last, actual last values are not important) predicted_actions = model( # fix this noqa!!! 
- states[:, : step + 1][:, -model.seq_len :], # noqa - actions[:, : step + 1][:, -model.seq_len :], # noqa - returns[:, : step + 1][:, -model.seq_len :], # noqa - time_steps[:, : step + 1][:, -model.seq_len :], # noqa + states[:, : step + 1][:, -model.seq_len :], + actions[:, : step + 1][:, -model.seq_len :], + returns[:, : step + 1][:, -model.seq_len :], + time_steps[:, : step + 1][:, -model.seq_len :], ) predicted_action = predicted_actions[0, -1].cpu().numpy() next_state, reward, done, info = env.step(predicted_action) diff --git a/algorithms/offline/edac.py b/algorithms/offline/edac.py index f8686905..413801c9 100644 --- a/algorithms/offline/edac.py +++ b/algorithms/offline/edac.py @@ -1,24 +1,23 @@ # Inspired by: # 1. paper for SAC-N: https://arxiv.org/abs/2110.01548 # 2. implementation: https://github.com/snu-mllab/EDAC -from typing import Any, Dict, List, Optional, Tuple, Union -from copy import deepcopy -from dataclasses import asdict, dataclass import math import os import random import uuid +from copy import deepcopy +from dataclasses import asdict, dataclass +from typing import Any, Dict, List, Optional, Tuple, Union import d4rl import gym import numpy as np import pyrallis import torch -from torch.distributions import Normal import torch.nn as nn -from tqdm import trange import wandb - +from torch.distributions import Normal +from tqdm import trange @dataclass class TrainConfig: @@ -313,7 +312,7 @@ def __init__( tau: float = 0.005, eta: float = 1.0, alpha_learning_rate: float = 1e-4, - device: str = "cpu", # noqa + device: str = "cpu", ): self.device = device diff --git a/algorithms/offline/iql.py b/algorithms/offline/iql.py index 87517773..257184f5 100644 --- a/algorithms/offline/iql.py +++ b/algorithms/offline/iql.py @@ -1,29 +1,29 @@ # source: https://github.com/gwthomas/IQL-PyTorch # https://arxiv.org/pdf/2110.06169.pdf -from typing import Any, Callable, Dict, List, Optional, Tuple, Union import copy -from dataclasses import asdict, dataclass import os -from pathlib import Path import random import uuid +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import d4rl import gym import numpy as np import pyrallis import torch -from torch.distributions import MultivariateNormal import torch.nn as nn import torch.nn.functional as F -from torch.optim.lr_scheduler import CosineAnnealingLR import wandb +from torch.distributions import Normal +from torch.optim.lr_scheduler import CosineAnnealingLR TensorBatch = List[torch.Tensor] EXP_ADV_MAX = 100.0 -LOG_STD_MIN = -5.0 +LOG_STD_MIN = -20.0 LOG_STD_MAX = 2.0 @@ -48,6 +48,10 @@ class TrainConfig: iql_deterministic: bool = False # Use deterministic actor normalize: bool = True # Normalize states normalize_reward: bool = False # Normalize reward + vf_lr: float = 3e-4 # V function learning rate + qf_lr: float = 3e-4 # Critic learning rate + actor_lr: float = 3e-4 # Actor learning rate + actor_dropout: Optional[float] = None # Adroit uses dropout for policy network # Wandb logging project: str = "CORL" group: str = "IQL-D4RL" @@ -246,6 +250,7 @@ def __init__( activation_fn: Callable[[], nn.Module] = nn.ReLU, output_activation_fn: Callable[[], nn.Module] = None, squeeze_output: bool = False, + dropout: Optional[float] = None, ): super().__init__() n_dims = len(dims) @@ -256,6 +261,10 @@ def __init__( for i in range(n_dims - 2): layers.append(nn.Linear(dims[i], dims[i + 1])) layers.append(activation_fn()) + + if dropout is not None: + 
layers.append(nn.Dropout(dropout)) + layers.append(nn.Linear(dims[-2], dims[-1])) if output_activation_fn is not None: layers.append(output_activation_fn()) @@ -277,6 +286,7 @@ def __init__( max_action: float, hidden_dim: int = 256, n_hidden: int = 2, + dropout: Optional[float] = None, ): super().__init__() self.net = MLP( @@ -286,11 +296,10 @@ def __init__( self.log_std = nn.Parameter(torch.zeros(act_dim, dtype=torch.float32)) self.max_action = max_action - def forward(self, obs: torch.Tensor) -> MultivariateNormal: + def forward(self, obs: torch.Tensor) -> Normal: mean = self.net(obs) std = torch.exp(self.log_std.clamp(LOG_STD_MIN, LOG_STD_MAX)) - scale_tril = torch.diag(std) - return MultivariateNormal(mean, scale_tril=scale_tril) + return Normal(mean, std) @torch.no_grad() def act(self, state: np.ndarray, device: str = "cpu"): @@ -309,11 +318,13 @@ def __init__( max_action: float, hidden_dim: int = 256, n_hidden: int = 2, + dropout: Optional[float] = None, ): super().__init__() self.net = MLP( [state_dim, *([hidden_dim] * n_hidden), act_dim], output_activation_fn=nn.Tanh, + dropout=dropout, ) self.max_action = max_action @@ -403,36 +414,42 @@ def _update_v(self, observations, actions, log_dict) -> torch.Tensor: adv = target_q - v v_loss = asymmetric_l2_loss(adv, self.iql_tau) log_dict["value_loss"] = v_loss.item() - self.v_optimizer.zero_grad(set_to_none=True) + self.v_optimizer.zero_grad() v_loss.backward() self.v_optimizer.step() return adv def _update_q( self, - next_v, - observations, - actions, - rewards, - terminals, - log_dict, + next_v: torch.Tensor, + observations: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + terminals: torch.Tensor, + log_dict: Dict, ): targets = rewards + (1.0 - terminals.float()) * self.discount * next_v.detach() qs = self.qf.both(observations, actions) q_loss = sum(F.mse_loss(q, targets) for q in qs) / len(qs) log_dict["q_loss"] = q_loss.item() - self.q_optimizer.zero_grad(set_to_none=True) + self.q_optimizer.zero_grad() q_loss.backward() self.q_optimizer.step() # Update target Q network soft_update(self.q_target, self.qf, self.tau) - def _update_policy(self, adv, observations, actions, log_dict): + def _update_policy( + self, + adv: torch.Tensor, + observations: torch.Tensor, + actions: torch.Tensor, + log_dict: Dict, + ): exp_adv = torch.exp(self.beta * adv.detach()).clamp(max=EXP_ADV_MAX) policy_out = self.actor(observations) if isinstance(policy_out, torch.distributions.Distribution): - bc_losses = -policy_out.log_prob(actions) + bc_losses = -policy_out.log_prob(actions).sum(-1, keepdim=False) elif torch.is_tensor(policy_out): if policy_out.shape != actions.shape: raise RuntimeError("Actions shape missmatch") @@ -441,7 +458,7 @@ def _update_policy(self, adv, observations, actions, log_dict): raise NotImplementedError policy_loss = torch.mean(exp_adv * bc_losses) log_dict["actor_loss"] = policy_loss.item() - self.actor_optimizer.zero_grad(set_to_none=True) + self.actor_optimizer.zero_grad() policy_loss.backward() self.actor_optimizer.step() self.actor_lr_schedule.step() @@ -544,13 +561,17 @@ def train(config: TrainConfig): q_network = TwinQ(state_dim, action_dim).to(config.device) v_network = ValueFunction(state_dim).to(config.device) actor = ( - DeterministicPolicy(state_dim, action_dim, max_action) + DeterministicPolicy( + state_dim, action_dim, max_action, dropout=config.actor_dropout + ) if config.iql_deterministic - else GaussianPolicy(state_dim, action_dim, max_action) + else GaussianPolicy( + state_dim, action_dim, 
max_action, dropout=config.actor_dropout + ) ).to(config.device) - v_optimizer = torch.optim.Adam(v_network.parameters(), lr=3e-4) - q_optimizer = torch.optim.Adam(q_network.parameters(), lr=3e-4) - actor_optimizer = torch.optim.Adam(actor.parameters(), lr=3e-4) + v_optimizer = torch.optim.Adam(v_network.parameters(), lr=config.vf_lr) + q_optimizer = torch.optim.Adam(q_network.parameters(), lr=config.qf_lr) + actor_optimizer = torch.optim.Adam(actor.parameters(), lr=config.actor_lr) kwargs = { "max_action": max_action, diff --git a/algorithms/offline/lb_sac.py b/algorithms/offline/lb_sac.py index efe8c66a..71fb8c54 100644 --- a/algorithms/offline/lb_sac.py +++ b/algorithms/offline/lb_sac.py @@ -1,24 +1,23 @@ # Inspired by: # 1. paper for LB-SAC: https://arxiv.org/abs/2211.11092 # 2. implementation: https://github.com/tinkoff-ai/lb-sac -from typing import Any, Dict, List, Optional, Tuple, Union -from copy import deepcopy -from dataclasses import asdict, dataclass import math import os import random import uuid +from copy import deepcopy +from dataclasses import asdict, dataclass +from typing import Any, Dict, List, Optional, Tuple, Union import d4rl import gym import numpy as np import pyrallis import torch -from torch.distributions import Normal import torch.nn as nn -from tqdm import trange import wandb - +from torch.distributions import Normal +from tqdm import trange # base batch size: 256 # base learning rate: 3e-4 @@ -328,7 +327,7 @@ def __init__( gamma: float = 0.99, tau: float = 0.005, alpha_learning_rate: float = 1e-4, - device: str = "cpu", # noqa + device: str = "cpu", ): self.device = device diff --git a/algorithms/offline/rebrac.py b/algorithms/offline/rebrac.py new file mode 100644 index 00000000..52b48ce2 --- /dev/null +++ b/algorithms/offline/rebrac.py @@ -0,0 +1,760 @@ +# source: https://github.com/tinkoff-ai/ReBRAC +# https://arxiv.org/abs/2305.09836 + +import os + +os.environ["TF_CUDNN_DETERMINISTIC"] = "1" # For reproducibility + +import math +import uuid +from copy import deepcopy +from dataclasses import asdict, dataclass +from functools import partial +from typing import Any, Callable, Dict, Sequence, Tuple, Union + +import chex +import d4rl # noqa +import flax.linen as nn +import gym +import jax +import jax.numpy as jnp +import numpy as np +import optax +import pyrallis +import wandb +from flax.core import FrozenDict +from flax.training.train_state import TrainState +from tqdm.auto import trange + +default_kernel_init = nn.initializers.lecun_normal() +default_bias_init = nn.initializers.zeros + + +@dataclass +class Config: + # wandb params + project: str = "CORL" + group: str = "rebrac" + name: str = "rebrac" + # model params + actor_learning_rate: float = 1e-3 + critic_learning_rate: float = 1e-3 + hidden_dim: int = 256 + actor_n_hiddens: int = 3 + critic_n_hiddens: int = 3 + gamma: float = 0.99 + tau: float = 5e-3 + actor_bc_coef: float = 1.0 + critic_bc_coef: float = 1.0 + actor_ln: bool = False + critic_ln: bool = True + policy_noise: float = 0.2 + noise_clip: float = 0.5 + policy_freq: int = 2 + normalize_q: bool = True + # training params + dataset_name: str = "halfcheetah-medium-v2" + batch_size: int = 1024 + num_epochs: int = 1000 + num_updates_on_epoch: int = 1000 + normalize_reward: bool = False + normalize_states: bool = False + # evaluation params + eval_episodes: int = 10 + eval_every: int = 5 + # general params + train_seed: int = 0 + eval_seed: int = 42 + + def __post_init__(self): + self.name = 
f"{self.name}-{self.dataset_name}-{str(uuid.uuid4())[:8]}" + + +def pytorch_init(fan_in: float) -> Callable: + """ + Default init for PyTorch Linear layer weights and biases: + https://pytorch.org/docs/stable/generated/torch.nn.Linear.html + """ + bound = math.sqrt(1 / fan_in) + + def _init(key: jax.random.PRNGKey, shape: Tuple, dtype: type) -> jax.Array: + return jax.random.uniform( + key, shape=shape, minval=-bound, maxval=bound, dtype=dtype + ) + + return _init + + +def uniform_init(bound: float) -> Callable: + def _init(key: jax.random.PRNGKey, shape: Tuple, dtype: type) -> jax.Array: + return jax.random.uniform( + key, shape=shape, minval=-bound, maxval=bound, dtype=dtype + ) + + return _init + + +def identity(x: Any) -> Any: + return x + + +class DetActor(nn.Module): + action_dim: int + hidden_dim: int = 256 + layernorm: bool = True + n_hiddens: int = 3 + + @nn.compact + def __call__(self, state: jax.Array) -> jax.Array: + s_d, h_d = state.shape[-1], self.hidden_dim + # Initialization as in the EDAC paper + layers = [ + nn.Dense( + self.hidden_dim, + kernel_init=pytorch_init(s_d), + bias_init=nn.initializers.constant(0.1), + ), + nn.relu, + nn.LayerNorm() if self.layernorm else identity, + ] + for _ in range(self.n_hiddens - 1): + layers += [ + nn.Dense( + self.hidden_dim, + kernel_init=pytorch_init(h_d), + bias_init=nn.initializers.constant(0.1), + ), + nn.relu, + nn.LayerNorm() if self.layernorm else identity, + ] + layers += [ + nn.Dense( + self.action_dim, + kernel_init=uniform_init(1e-3), + bias_init=uniform_init(1e-3), + ), + nn.tanh, + ] + net = nn.Sequential(layers) + actions = net(state) + return actions + + +class Critic(nn.Module): + hidden_dim: int = 256 + layernorm: bool = True + n_hiddens: int = 3 + + @nn.compact + def __call__(self, state: jax.Array, action: jax.Array) -> jax.Array: + s_d, a_d, h_d = state.shape[-1], action.shape[-1], self.hidden_dim + # Initialization as in the EDAC paper + layers = [ + nn.Dense( + self.hidden_dim, + kernel_init=pytorch_init(s_d + a_d), + bias_init=nn.initializers.constant(0.1), + ), + nn.relu, + nn.LayerNorm() if self.layernorm else identity, + ] + for _ in range(self.n_hiddens - 1): + layers += [ + nn.Dense( + self.hidden_dim, + kernel_init=pytorch_init(h_d), + bias_init=nn.initializers.constant(0.1), + ), + nn.relu, + nn.LayerNorm() if self.layernorm else identity, + ] + layers += [ + nn.Dense(1, kernel_init=uniform_init(3e-3), bias_init=uniform_init(3e-3)) + ] + network = nn.Sequential(layers) + state_action = jnp.hstack([state, action]) + out = network(state_action).squeeze(-1) + return out + + +class EnsembleCritic(nn.Module): + hidden_dim: int = 256 + num_critics: int = 10 + layernorm: bool = True + n_hiddens: int = 3 + + @nn.compact + def __call__(self, state: jax.Array, action: jax.Array) -> jax.Array: + ensemble = nn.vmap( + target=Critic, + in_axes=None, + out_axes=0, + variable_axes={"params": 0}, + split_rngs={"params": True}, + axis_size=self.num_critics, + ) + q_values = ensemble(self.hidden_dim, self.layernorm, self.n_hiddens)( + state, action + ) + return q_values + + +def qlearning_dataset( + env: gym.Env, + dataset: Dict = None, + terminate_on_end: bool = False, + **kwargs, +) -> Dict: + if dataset is None: + dataset = env.get_dataset(**kwargs) + + N = dataset["rewards"].shape[0] + obs_ = [] + next_obs_ = [] + action_ = [] + next_action_ = [] + reward_ = [] + done_ = [] + + # The newer version of the dataset adds an explicit + # timeouts field. Keep old method for backwards compatability. 
+ use_timeouts = "timeouts" in dataset + + episode_step = 0 + for i in range(N - 1): + obs = dataset["observations"][i].astype(np.float32) + new_obs = dataset["observations"][i + 1].astype(np.float32) + action = dataset["actions"][i].astype(np.float32) + new_action = dataset["actions"][i + 1].astype(np.float32) + reward = dataset["rewards"][i].astype(np.float32) + done_bool = bool(dataset["terminals"][i]) + + if use_timeouts: + final_timestep = dataset["timeouts"][i] + else: + final_timestep = episode_step == env._max_episode_steps - 1 + if (not terminate_on_end) and final_timestep: + # Skip this transition + episode_step = 0 + continue + if done_bool or final_timestep: + episode_step = 0 + + obs_.append(obs) + next_obs_.append(new_obs) + action_.append(action) + next_action_.append(new_action) + reward_.append(reward) + done_.append(done_bool) + episode_step += 1 + + return { + "observations": np.array(obs_), + "actions": np.array(action_), + "next_observations": np.array(next_obs_), + "next_actions": np.array(next_action_), + "rewards": np.array(reward_), + "terminals": np.array(done_), + } + + +def compute_mean_std(states: jax.Array, eps: float) -> Tuple[jax.Array, jax.Array]: + mean = states.mean(0) + std = states.std(0) + eps + return mean, std + + +def normalize_states(states: jax.Array, mean: jax.Array, std: jax.Array) -> jax.Array: + return (states - mean) / std + + +@chex.dataclass +class ReplayBuffer: + data: Dict[str, jax.Array] = None + mean: float = 0 + std: float = 1 + + def create_from_d4rl( + self, + dataset_name: str, + normalize_reward: bool = False, + is_normalize: bool = False, + ): + d4rl_data = qlearning_dataset(gym.make(dataset_name)) + buffer = { + "states": jnp.asarray(d4rl_data["observations"], dtype=jnp.float32), + "actions": jnp.asarray(d4rl_data["actions"], dtype=jnp.float32), + "rewards": jnp.asarray(d4rl_data["rewards"], dtype=jnp.float32), + "next_states": jnp.asarray( + d4rl_data["next_observations"], dtype=jnp.float32 + ), + "next_actions": jnp.asarray(d4rl_data["next_actions"], dtype=jnp.float32), + "dones": jnp.asarray(d4rl_data["terminals"], dtype=jnp.float32), + } + if is_normalize: + self.mean, self.std = compute_mean_std(buffer["states"], eps=1e-3) + buffer["states"] = normalize_states(buffer["states"], self.mean, self.std) + buffer["next_states"] = normalize_states( + buffer["next_states"], self.mean, self.std + ) + if normalize_reward: + buffer["rewards"] = ReplayBuffer.normalize_reward( + dataset_name, buffer["rewards"] + ) + self.data = buffer + + @property + def size(self) -> int: + # WARN: It will use len of the dataclass, i.e. number of fields. + return self.data["states"].shape[0] + + def sample_batch( + self, key: jax.random.PRNGKey, batch_size: int + ) -> Dict[str, jax.Array]: + indices = jax.random.randint( + key, shape=(batch_size,), minval=0, maxval=self.size + ) + batch = jax.tree_map(lambda arr: arr[indices], self.data) + return batch + + def get_moments(self, modality: str) -> Tuple[jax.Array, jax.Array]: + mean = self.data[modality].mean(0) + std = self.data[modality].std(0) + return mean, std + + @staticmethod + def normalize_reward(dataset_name: str, rewards: jax.Array) -> jax.Array: + if "antmaze" in dataset_name: + return rewards * 100.0 # like in LAPO + else: + raise NotImplementedError( + "Reward normalization is implemented only for AntMaze yet!" 
+ ) + + +@chex.dataclass(frozen=True) +class Metrics: + accumulators: Dict[str, Tuple[jax.Array, jax.Array]] + + @staticmethod + def create(metrics: Sequence[str]) -> "Metrics": + init_metrics = {key: (jnp.array([0.0]), jnp.array([0.0])) for key in metrics} + return Metrics(accumulators=init_metrics) + + def update(self, updates: Dict[str, jax.Array]) -> "Metrics": + new_accumulators = deepcopy(self.accumulators) + for key, value in updates.items(): + acc, steps = new_accumulators[key] + new_accumulators[key] = (acc + value, steps + 1) + + return self.replace(accumulators=new_accumulators) + + def compute(self) -> Dict[str, np.ndarray]: + # cumulative_value / total_steps + return {k: np.array(v[0] / v[1]) for k, v in self.accumulators.items()} + + +def normalize( + arr: jax.Array, mean: jax.Array, std: jax.Array, eps: float = 1e-8 +) -> jax.Array: + return (arr - mean) / (std + eps) + + +def make_env(env_name: str, seed: int) -> gym.Env: + env = gym.make(env_name) + env.seed(seed) + env.action_space.seed(seed) + env.observation_space.seed(seed) + return env + + +def wrap_env( + env: gym.Env, + state_mean: Union[np.ndarray, float] = 0.0, + state_std: Union[np.ndarray, float] = 1.0, + reward_scale: float = 1.0, +) -> gym.Env: + # PEP 8: E731 do not assign a lambda expression, use a def + def normalize_state(state: np.ndarray) -> np.ndarray: + return ( + state - state_mean + ) / state_std # epsilon should be already added in std. + + def scale_reward(reward: float) -> float: + # Please be careful, here reward is multiplied by scale! + return reward_scale * reward + + env = gym.wrappers.TransformObservation(env, normalize_state) + if reward_scale != 1.0: + env = gym.wrappers.TransformReward(env, scale_reward) + return env + + +def evaluate( + env: gym.Env, + params: jax.Array, + action_fn: Callable, + num_episodes: int, + seed: int, +) -> np.ndarray: + env.seed(seed) + env.action_space.seed(seed) + env.observation_space.seed(seed) + + returns = [] + for _ in trange(num_episodes, desc="Eval", leave=False): + obs, done = env.reset(), False + total_reward = 0.0 + while not done: + action = np.asarray(jax.device_get(action_fn(params, obs))) + obs, reward, done, _ = env.step(action) + total_reward += reward + returns.append(total_reward) + + return np.array(returns) + + +class CriticTrainState(TrainState): + target_params: FrozenDict + + +class ActorTrainState(TrainState): + target_params: FrozenDict + + +def update_actor( + key: jax.random.PRNGKey, + actor: TrainState, + critic: TrainState, + batch: Dict[str, jax.Array], + beta: float, + tau: float, + normalize_q: bool, + metrics: Metrics, +) -> Tuple[jax.random.PRNGKey, TrainState, TrainState, Metrics]: + key, random_action_key = jax.random.split(key, 2) + + def actor_loss_fn(params: jax.Array) -> Tuple[jax.Array, Metrics]: + actions = actor.apply_fn(params, batch["states"]) + + bc_penalty = ((actions - batch["actions"]) ** 2).sum(-1) + q_values = critic.apply_fn(critic.params, batch["states"], actions).min(0) + lmbda = 1 + if normalize_q: + lmbda = jax.lax.stop_gradient(1 / jax.numpy.abs(q_values).mean()) + + loss = (beta * bc_penalty - lmbda * q_values).mean() + + # logging stuff + random_actions = jax.random.uniform( + random_action_key, shape=batch["actions"].shape, minval=-1.0, maxval=1.0 + ) + new_metrics = metrics.update( + { + "actor_loss": loss, + "bc_mse_policy": bc_penalty.mean(), + "bc_mse_random": ((random_actions - batch["actions"]) ** 2) + .sum(-1) + .mean(), + "action_mse": ((actions - batch["actions"]) ** 2).mean(), + } + ) + 
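        # Note: with normalize_q=True the loss computed above is
        #   mean( beta * ||pi(s) - a_data||^2  -  (1 / mean|Q_min|) * Q_min(s, pi(s)) ),
        # where beta is the actor BC coefficient and the 1 / mean|Q_min| factor is wrapped
        # in stop_gradient, keeping the BC penalty and the Q term on comparable scales.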
return loss, new_metrics + + grads, new_metrics = jax.grad(actor_loss_fn, has_aux=True)(actor.params) + new_actor = actor.apply_gradients(grads=grads) + + new_actor = new_actor.replace( + target_params=optax.incremental_update(actor.params, actor.target_params, tau) + ) + new_critic = critic.replace( + target_params=optax.incremental_update(critic.params, critic.target_params, tau) + ) + + return key, new_actor, new_critic, new_metrics + + +def update_critic( + key: jax.random.PRNGKey, + actor: TrainState, + critic: CriticTrainState, + batch: Dict[str, jax.Array], + gamma: float, + beta: float, + tau: float, + policy_noise: float, + noise_clip: float, + metrics: Metrics, +) -> Tuple[jax.random.PRNGKey, TrainState, Metrics]: + key, actions_key = jax.random.split(key) + + next_actions = actor.apply_fn(actor.target_params, batch["next_states"]) + noise = jax.numpy.clip( + (jax.random.normal(actions_key, next_actions.shape) * policy_noise), + -noise_clip, + noise_clip, + ) + next_actions = jax.numpy.clip(next_actions + noise, -1, 1) + bc_penalty = ((next_actions - batch["next_actions"]) ** 2).sum(-1) + next_q = critic.apply_fn( + critic.target_params, batch["next_states"], next_actions + ).min(0) + next_q = next_q - beta * bc_penalty + + target_q = batch["rewards"] + (1 - batch["dones"]) * gamma * next_q + + def critic_loss_fn(critic_params: jax.Array) -> Tuple[jax.Array, jax.Array]: + # [N, batch_size] - [1, batch_size] + q = critic.apply_fn(critic_params, batch["states"], batch["actions"]) + q_min = q.min(0).mean() + loss = ((q - target_q[None, ...]) ** 2).mean(1).sum(0) + return loss, q_min + + (loss, q_min), grads = jax.value_and_grad(critic_loss_fn, has_aux=True)( + critic.params + ) + new_critic = critic.apply_gradients(grads=grads) + new_metrics = metrics.update( + { + "critic_loss": loss, + "q_min": q_min, + } + ) + return key, new_critic, new_metrics + + +def update_td3( + key: jax.random.PRNGKey, + actor: TrainState, + critic: CriticTrainState, + batch: Dict[str, Any], + metrics: Metrics, + gamma: float, + actor_bc_coef: float, + critic_bc_coef: float, + tau: float, + policy_noise: float, + noise_clip: float, + normalize_q: bool, +) -> Tuple[jax.random.PRNGKey, TrainState, TrainState, Metrics]: + key, new_critic, new_metrics = update_critic( + key, + actor, + critic, + batch, + gamma, + critic_bc_coef, + tau, + policy_noise, + noise_clip, + metrics, + ) + key, new_actor, new_critic, new_metrics = update_actor( + key, actor, new_critic, batch, actor_bc_coef, tau, normalize_q, new_metrics + ) + return key, new_actor, new_critic, new_metrics + + +def update_td3_no_targets( + key: jax.random.PRNGKey, + actor: TrainState, + critic: CriticTrainState, + batch: Dict[str, Any], + gamma: float, + metrics: Metrics, + actor_bc_coef: float, + critic_bc_coef: float, + tau: float, + policy_noise: float, + noise_clip: float, +) -> Tuple[jax.random.PRNGKey, TrainState, TrainState, Metrics]: + key, new_critic, new_metrics = update_critic( + key, + actor, + critic, + batch, + gamma, + critic_bc_coef, + tau, + policy_noise, + noise_clip, + metrics, + ) + return key, actor, new_critic, new_metrics + + +def action_fn(actor: TrainState) -> Callable: + @jax.jit + def _action_fn(obs: jax.Array) -> jax.Array: + action = actor.apply_fn(actor.params, obs) + return action + + return _action_fn + + +@pyrallis.wrap() +def main(config: Config): + dict_config = asdict(config) + dict_config["mlc_job_name"] = os.environ.get("PLATFORM_JOB_NAME") + + wandb.init( + config=dict_config, + project=config.project, + 
group=config.group, + name=config.name, + id=str(uuid.uuid4()), + ) + wandb.mark_preempting() + buffer = ReplayBuffer() + buffer.create_from_d4rl( + config.dataset_name, config.normalize_reward, config.normalize_states + ) + + key = jax.random.PRNGKey(seed=config.train_seed) + key, actor_key, critic_key = jax.random.split(key, 3) + + eval_env = make_env(config.dataset_name, seed=config.eval_seed) + eval_env = wrap_env(eval_env, buffer.mean, buffer.std) + init_state = buffer.data["states"][0][None, ...] + init_action = buffer.data["actions"][0][None, ...] + + actor_module = DetActor( + action_dim=init_action.shape[-1], + hidden_dim=config.hidden_dim, + layernorm=config.actor_ln, + n_hiddens=config.actor_n_hiddens, + ) + actor = ActorTrainState.create( + apply_fn=actor_module.apply, + params=actor_module.init(actor_key, init_state), + target_params=actor_module.init(actor_key, init_state), + tx=optax.adam(learning_rate=config.actor_learning_rate), + ) + + critic_module = EnsembleCritic( + hidden_dim=config.hidden_dim, + num_critics=2, + layernorm=config.critic_ln, + n_hiddens=config.critic_n_hiddens, + ) + critic = CriticTrainState.create( + apply_fn=critic_module.apply, + params=critic_module.init(critic_key, init_state, init_action), + target_params=critic_module.init(critic_key, init_state, init_action), + tx=optax.adam(learning_rate=config.critic_learning_rate), + ) + + update_td3_partial = partial( + update_td3, + gamma=config.gamma, + actor_bc_coef=config.actor_bc_coef, + critic_bc_coef=config.critic_bc_coef, + tau=config.tau, + policy_noise=config.policy_noise, + noise_clip=config.noise_clip, + normalize_q=config.normalize_q, + ) + + update_td3_no_targets_partial = partial( + update_td3_no_targets, + gamma=config.gamma, + actor_bc_coef=config.actor_bc_coef, + critic_bc_coef=config.critic_bc_coef, + tau=config.tau, + policy_noise=config.policy_noise, + noise_clip=config.noise_clip, + ) + + def td3_loop_update_step(i: int, carry: TrainState): + key, batch_key = jax.random.split(carry["key"]) + batch = carry["buffer"].sample_batch(batch_key, batch_size=config.batch_size) + + full_update = partial( + update_td3_partial, + key=key, + actor=carry["actor"], + critic=carry["critic"], + batch=batch, + metrics=carry["metrics"], + ) + + update = partial( + update_td3_no_targets_partial, + key=key, + actor=carry["actor"], + critic=carry["critic"], + batch=batch, + metrics=carry["metrics"], + ) + + key, new_actor, new_critic, new_metrics = jax.lax.cond( + update_carry["delayed_updates"][i], full_update, update + ) + + carry.update(key=key, actor=new_actor, critic=new_critic, metrics=new_metrics) + return carry + + # metrics + bc_metrics_to_log = [ + "critic_loss", + "q_min", + "actor_loss", + "batch_entropy", + "bc_mse_policy", + "bc_mse_random", + "action_mse", + ] + # shared carry for update loops + update_carry = { + "key": key, + "actor": actor, + "critic": critic, + "buffer": buffer, + "delayed_updates": jax.numpy.equal( + jax.numpy.arange(config.num_updates_on_epoch) % config.policy_freq, 0 + ).astype(int), + } + + @jax.jit + def actor_action_fn(params: jax.Array, obs: jax.Array): + return actor.apply_fn(params, obs) + + for epoch in trange(config.num_epochs, desc="ReBRAC Epochs"): + # metrics for accumulation during epoch and logging to wandb + # we need to reset them every epoch + update_carry["metrics"] = Metrics.create(bc_metrics_to_log) + + update_carry = jax.lax.fori_loop( + lower=0, + upper=config.num_updates_on_epoch, + body_fun=td3_loop_update_step, + init_val=update_carry, + ) + # 
log mean over epoch for each metric + mean_metrics = update_carry["metrics"].compute() + wandb.log( + {"epoch": epoch, **{f"ReBRAC/{k}": v for k, v in mean_metrics.items()}} + ) + + if epoch % config.eval_every == 0 or epoch == config.num_epochs - 1: + eval_returns = evaluate( + eval_env, + update_carry["actor"].params, + actor_action_fn, + config.eval_episodes, + seed=config.eval_seed, + ) + normalized_score = eval_env.get_normalized_score(eval_returns) * 100.0 + wandb.log( + { + "epoch": epoch, + "eval/return_mean": np.mean(eval_returns), + "eval/return_std": np.std(eval_returns), + "eval/normalized_score_mean": np.mean(normalized_score), + "eval/normalized_score_std": np.std(normalized_score), + } + ) + + +if __name__ == "__main__": + main() diff --git a/algorithms/offline/sac_n.py b/algorithms/offline/sac_n.py index 8d9874b0..0b91ddec 100644 --- a/algorithms/offline/sac_n.py +++ b/algorithms/offline/sac_n.py @@ -1,24 +1,23 @@ # Inspired by: # 1. paper for SAC-N: https://arxiv.org/abs/2110.01548 # 2. implementation: https://github.com/snu-mllab/EDAC -from typing import Any, Dict, List, Optional, Tuple, Union -from copy import deepcopy -from dataclasses import asdict, dataclass import math import os import random import uuid +from copy import deepcopy +from dataclasses import asdict, dataclass +from typing import Any, Dict, List, Optional, Tuple, Union import d4rl import gym import numpy as np import pyrallis import torch -from torch.distributions import Normal import torch.nn as nn -from tqdm import trange import wandb - +from torch.distributions import Normal +from tqdm import trange @dataclass class TrainConfig: @@ -308,7 +307,7 @@ def __init__( gamma: float = 0.99, tau: float = 0.005, alpha_learning_rate: float = 1e-4, - device: str = "cpu", # noqa + device: str = "cpu", ): self.device = device diff --git a/algorithms/offline/td3_bc.py b/algorithms/offline/td3_bc.py index 7c1d6b8d..a78bda30 100644 --- a/algorithms/offline/td3_bc.py +++ b/algorithms/offline/td3_bc.py @@ -1,12 +1,12 @@ # source: https://github.com/sfujim/TD3_BC # https://arxiv.org/pdf/2106.06860.pdf -from typing import Any, Dict, List, Optional, Tuple, Union import copy -from dataclasses import asdict, dataclass import os -from pathlib import Path import random import uuid +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union import d4rl import gym @@ -263,7 +263,7 @@ def forward(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor: return self.net(sa) -class TD3_BC: # noqa +class TD3_BC: def __init__( self, max_action: float, @@ -497,10 +497,13 @@ def train(config: TrainConfig): f"{eval_score:.3f} , D4RL score: {normalized_eval_score:.3f}" ) print("---------------------------------------") - torch.save( - trainer.state_dict(), - os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"), - ) + + if config.checkpoints_path is not None: + torch.save( + trainer.state_dict(), + os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"), + ) + wandb.log( {"d4rl_normalized_score": normalized_eval_score}, step=trainer.total_it, diff --git a/configs/finetune/awac/antmaze/large_diverse_v2.yaml b/configs/finetune/awac/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..0b9888c1 --- /dev/null +++ b/configs/finetune/awac/antmaze/large_diverse_v2.yaml @@ -0,0 +1,20 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: antmaze-large-diverse-v2 
+eval_frequency: 50000 +gamma: 0.99 +group: awac-antmaze-large-diverse-v2-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 100 +normalize_reward: true +offline_iterations: 1000000 +online_iterations: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 \ No newline at end of file diff --git a/configs/finetune/awac/antmaze/large_play_v2.yaml b/configs/finetune/awac/antmaze/large_play_v2.yaml new file mode 100644 index 00000000..3b5dab49 --- /dev/null +++ b/configs/finetune/awac/antmaze/large_play_v2.yaml @@ -0,0 +1,20 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: antmaze-large-play-v2 +eval_frequency: 50000 +gamma: 0.99 +group: awac-antmaze-large-play-v2-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 100 +normalize_reward: true +offline_iterations: 1000000 +online_iterations: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 \ No newline at end of file diff --git a/configs/finetune/awac/antmaze/medium_diverse_v2.yaml b/configs/finetune/awac/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..44ca49d4 --- /dev/null +++ b/configs/finetune/awac/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,20 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: antmaze-medium-diverse-v2 +eval_frequency: 50000 +gamma: 0.99 +group: awac-antmaze-medium-diverse-v2-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 100 +normalize_reward: true +offline_iterations: 1000000 +online_iterations: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 \ No newline at end of file diff --git a/configs/finetune/awac/antmaze/medium_play_v2.yaml b/configs/finetune/awac/antmaze/medium_play_v2.yaml new file mode 100644 index 00000000..9c120c99 --- /dev/null +++ b/configs/finetune/awac/antmaze/medium_play_v2.yaml @@ -0,0 +1,20 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: antmaze-medium-play-v2 +eval_frequency: 50000 +gamma: 0.99 +group: awac-antmaze-medium-play-v2-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 100 +normalize_reward: true +offline_iterations: 1000000 +online_iterations: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 \ No newline at end of file diff --git a/configs/finetune/awac/antmaze/umaze_diverse_v2.yaml b/configs/finetune/awac/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..b214f381 --- /dev/null +++ b/configs/finetune/awac/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,20 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: antmaze-umaze-diverse-v2 +eval_frequency: 50000 +gamma: 0.99 +group: awac-antmaze-umaze-diverse-v2-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 100 +normalize_reward: true +offline_iterations: 1000000 +online_iterations: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 \ No newline at end of file diff --git a/configs/finetune/awac/antmaze/umaze_v2.yaml b/configs/finetune/awac/antmaze/umaze_v2.yaml new file mode 100644 index 00000000..08b258ef --- /dev/null +++ b/configs/finetune/awac/antmaze/umaze_v2.yaml @@ -0,0 +1,20 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: 
antmaze-umaze-v2 +eval_frequency: 50000 +gamma: 0.99 +group: awac-antmaze-umaze-v2-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 100 +normalize_reward: true +offline_iterations: 1000000 +online_iterations: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 \ No newline at end of file diff --git a/configs/finetune/awac/door/cloned_v1.yaml b/configs/finetune/awac/door/cloned_v1.yaml new file mode 100644 index 00000000..1f79d0b9 --- /dev/null +++ b/configs/finetune/awac/door/cloned_v1.yaml @@ -0,0 +1,20 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: door-cloned-v1 +eval_frequency: 5000 +gamma: 0.99 +group: awac-door-cloned-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +normalize_reward: false +offline_iterations: 1000000 +online_iterations: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 \ No newline at end of file diff --git a/configs/finetune/awac/hammer/cloned_v1.yaml b/configs/finetune/awac/hammer/cloned_v1.yaml new file mode 100644 index 00000000..ff67eab9 --- /dev/null +++ b/configs/finetune/awac/hammer/cloned_v1.yaml @@ -0,0 +1,20 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: hammer-cloned-v1 +eval_frequency: 5000 +gamma: 0.99 +group: awac-hammer-cloned-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +normalize_reward: false +offline_iterations: 1000000 +online_iterations: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 \ No newline at end of file diff --git a/configs/finetune/awac/pen/cloned_v1.yaml b/configs/finetune/awac/pen/cloned_v1.yaml new file mode 100644 index 00000000..6ee6972d --- /dev/null +++ b/configs/finetune/awac/pen/cloned_v1.yaml @@ -0,0 +1,20 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: pen-cloned-v1 +eval_frequency: 5000 +gamma: 0.99 +group: awac-pen-cloned-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +normalize_reward: false +offline_iterations: 1000000 +online_iterations: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 \ No newline at end of file diff --git a/configs/finetune/awac/relocate/cloned_v1.yaml b/configs/finetune/awac/relocate/cloned_v1.yaml new file mode 100644 index 00000000..36a87a58 --- /dev/null +++ b/configs/finetune/awac/relocate/cloned_v1.yaml @@ -0,0 +1,20 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: relocate-cloned-v1 +eval_frequency: 5000 +gamma: 0.99 +group: awac-relocate-cloned-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +normalize_reward: false +offline_iterations: 1000000 +online_iterations: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 \ No newline at end of file diff --git a/configs/finetune/cal_ql/antmaze/large_diverse_v2.yaml b/configs/finetune/cal_ql/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..eda25d68 --- /dev/null +++ b/configs/finetune/cal_ql/antmaze/large_diverse_v2.yaml @@ -0,0 +1,41 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_alpha: 5.0 +cql_alpha_online: 5.0 +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: 
true +cql_max_target_backup: true +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-large-diverse-v2 +eval_freq: 50000 +group: cql-antmaze-large-diverse-v2-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +mixing_ratio: 0.5 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true +is_sparse_reward: true diff --git a/configs/finetune/cal_ql/antmaze/large_play_v2.yaml b/configs/finetune/cal_ql/antmaze/large_play_v2.yaml new file mode 100644 index 00000000..05667be5 --- /dev/null +++ b/configs/finetune/cal_ql/antmaze/large_play_v2.yaml @@ -0,0 +1,41 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_alpha: 5.0 +cql_alpha_online: 5.0 +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-large-play-v2 +eval_freq: 50000 +group: cql-antmaze-large-play-v2-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +mixing_ratio: 0.5 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true +is_sparse_reward: true diff --git a/configs/finetune/cal_ql/antmaze/medium_diverse_v2.yaml b/configs/finetune/cal_ql/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..08872070 --- /dev/null +++ b/configs/finetune/cal_ql/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,41 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_alpha: 5.0 +cql_alpha_online: 5.0 +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-medium-diverse-v2 +eval_freq: 50000 +group: cql-antmaze-medium-diverse-v2-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +mixing_ratio: 0.5 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true +is_sparse_reward: true diff --git a/configs/finetune/cal_ql/antmaze/medium_play_v2.yaml b/configs/finetune/cal_ql/antmaze/medium_play_v2.yaml new file mode 100644 index 00000000..00264d14 --- /dev/null +++ b/configs/finetune/cal_ql/antmaze/medium_play_v2.yaml @@ -0,0 +1,41 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_alpha: 5.0 +cql_alpha_online: 5.0 +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_n_actions: 10 +cql_target_action_gap: 0.8 
+cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-medium-play-v2 +eval_freq: 50000 +group: cql-antmaze-medium-play-v2-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +mixing_ratio: 0.5 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true +is_sparse_reward: true diff --git a/configs/finetune/cal_ql/antmaze/umaze_diverse_v2.yaml b/configs/finetune/cal_ql/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..8452e8fb --- /dev/null +++ b/configs/finetune/cal_ql/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,41 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_alpha: 5.0 +cql_alpha_online: 5.0 +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-umaze-diverse-v2 +eval_freq: 50000 +group: cql-antmaze-umaze-diverse-v2-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +mixing_ratio: 0.5 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true +is_sparse_reward: true diff --git a/configs/finetune/cal_ql/antmaze/umaze_v2.yaml b/configs/finetune/cal_ql/antmaze/umaze_v2.yaml new file mode 100644 index 00000000..19ed1f12 --- /dev/null +++ b/configs/finetune/cal_ql/antmaze/umaze_v2.yaml @@ -0,0 +1,41 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_alpha: 5.0 +cql_alpha_online: 5.0 +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-umaze-v2 +eval_freq: 50000 +group: cql-antmaze-umaze-v2-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +mixing_ratio: 0.5 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true +is_sparse_reward: true diff --git a/configs/finetune/cal_ql/door/cloned_v1.yaml b/configs/finetune/cal_ql/door/cloned_v1.yaml new file mode 100644 index 00000000..d61d02e5 --- /dev/null +++ b/configs/finetune/cal_ql/door/cloned_v1.yaml @@ -0,0 +1,41 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_alpha: 1.0 +cql_alpha_online: 1.0 +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: true +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: door-cloned-v1 +eval_freq: 5000 +group: cql-door-cloned-v1-multiseed-v0 +load_model: '' 
+offline_iterations: 1000000 +online_iterations: 1000000 +mixing_ratio: 0.5 +n_episodes: 10 +name: CQL +normalize: false +normalize_reward: false +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true +is_sparse_reward: false \ No newline at end of file diff --git a/configs/finetune/cal_ql/hammer/cloned_v1.yaml b/configs/finetune/cal_ql/hammer/cloned_v1.yaml new file mode 100644 index 00000000..4ed087bc --- /dev/null +++ b/configs/finetune/cal_ql/hammer/cloned_v1.yaml @@ -0,0 +1,41 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_alpha: 1.0 +cql_alpha_online: 1.0 +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: true +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: hammer-cloned-v1 +eval_freq: 5000 +group: cql-hammer-cloned-v1-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +mixing_ratio: 0.5 +n_episodes: 10 +name: CQL +normalize: false +normalize_reward: false +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true +is_sparse_reward: false \ No newline at end of file diff --git a/configs/finetune/cal_ql/pen/cloned_v1.yaml b/configs/finetune/cal_ql/pen/cloned_v1.yaml new file mode 100644 index 00000000..20ff3115 --- /dev/null +++ b/configs/finetune/cal_ql/pen/cloned_v1.yaml @@ -0,0 +1,41 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_alpha: 1.0 +cql_alpha_online: 1.0 +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: true +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: pen-cloned-v1 +eval_freq: 5000 +group: cql-pen-cloned-v1-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +mixing_ratio: 0.5 +n_episodes: 10 +name: CQL +normalize: false +normalize_reward: false +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true +is_sparse_reward: false \ No newline at end of file diff --git a/configs/finetune/cal_ql/relocate/cloned_v1.yaml b/configs/finetune/cal_ql/relocate/cloned_v1.yaml new file mode 100644 index 00000000..c519967c --- /dev/null +++ b/configs/finetune/cal_ql/relocate/cloned_v1.yaml @@ -0,0 +1,41 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_alpha: 1.0 +cql_alpha_online: 1.0 +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: true +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: relocate-cloned-v1 +eval_freq: 5000 +group: cql-relocate-cloned-v1-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +mixing_ratio: 0.5 +n_episodes: 10 +name: CQL 
+normalize: false +normalize_reward: false +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true +is_sparse_reward: false \ No newline at end of file diff --git a/configs/finetune/cql/antmaze/large_diverse_v2.yaml b/configs/finetune/cql/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..44e617e8 --- /dev/null +++ b/configs/finetune/cql/antmaze/large_diverse_v2.yaml @@ -0,0 +1,39 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_alpha: 5.0 +cql_alpha_online: 5.0 +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-large-diverse-v2 +eval_freq: 50000 +group: cql-antmaze-large-diverse-v2-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true diff --git a/configs/finetune/cql/antmaze/large_play_v2.yaml b/configs/finetune/cql/antmaze/large_play_v2.yaml new file mode 100644 index 00000000..279a31c1 --- /dev/null +++ b/configs/finetune/cql/antmaze/large_play_v2.yaml @@ -0,0 +1,39 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_alpha: 5.0 +cql_alpha_online: 5.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-large-play-v2 +eval_freq: 50000 +group: cql-antmaze-large-play-v2-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true diff --git a/configs/finetune/cql/antmaze/medium_diverse_v2.yaml b/configs/finetune/cql/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..6b70cd59 --- /dev/null +++ b/configs/finetune/cql/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,39 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_alpha: 5.0 +cql_alpha_online: 5.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-medium-diverse-v2 +eval_freq: 50000 +group: cql-antmaze-medium-diverse-v2-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 
+q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true diff --git a/configs/finetune/cql/antmaze/medium_play_v2.yaml b/configs/finetune/cql/antmaze/medium_play_v2.yaml new file mode 100644 index 00000000..eef98298 --- /dev/null +++ b/configs/finetune/cql/antmaze/medium_play_v2.yaml @@ -0,0 +1,39 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_alpha: 5.0 +cql_alpha_online: 5.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-medium-play-v2 +eval_freq: 50000 +group: cql-antmaze-medium-play-v2-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true diff --git a/configs/finetune/cql/antmaze/umaze_diverse_v2.yaml b/configs/finetune/cql/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..1052c28c --- /dev/null +++ b/configs/finetune/cql/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,39 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_alpha: 5.0 +cql_alpha_online: 5.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-umaze-diverse-v2 +eval_freq: 50000 +group: cql-antmaze-umaze-diverse-v2-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true diff --git a/configs/finetune/cql/antmaze/umaze_v2.yaml b/configs/finetune/cql/antmaze/umaze_v2.yaml new file mode 100644 index 00000000..b555784c --- /dev/null +++ b/configs/finetune/cql/antmaze/umaze_v2.yaml @@ -0,0 +1,39 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_alpha: 5.0 +cql_alpha_online: 5.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-umaze-v2 +eval_freq: 50000 +group: cql-antmaze-umaze-v2-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true diff --git a/configs/finetune/cql/door/cloned_v1.yaml b/configs/finetune/cql/door/cloned_v1.yaml new file mode 100644 index 00000000..bec68d62 --- /dev/null +++ 
b/configs/finetune/cql/door/cloned_v1.yaml @@ -0,0 +1,39 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: true +cql_alpha: 1.0 +cql_alpha_online: 1.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: door-cloned-v1 +eval_freq: 5000 +group: cql-door-cloned-v1-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 10 +name: CQL +normalize: false +normalize_reward: false +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true diff --git a/configs/finetune/cql/hammer/cloned_v1.yaml b/configs/finetune/cql/hammer/cloned_v1.yaml new file mode 100644 index 00000000..5680757a --- /dev/null +++ b/configs/finetune/cql/hammer/cloned_v1.yaml @@ -0,0 +1,39 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: true +cql_alpha: 1.0 +cql_alpha_online: 1.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: hammer-cloned-v1 +eval_freq: 5000 +group: cql-hammer-cloned-v1-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 10 +name: CQL +normalize: false +normalize_reward: false +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true diff --git a/configs/finetune/cql/pen/cloned_v1.yaml b/configs/finetune/cql/pen/cloned_v1.yaml new file mode 100644 index 00000000..c2bcaa81 --- /dev/null +++ b/configs/finetune/cql/pen/cloned_v1.yaml @@ -0,0 +1,39 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: true +cql_alpha: 1.0 +cql_alpha_online: 1.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: pen-cloned-v1 +eval_freq: 5000 +group: cql-pen-cloned-v1-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 10 +name: CQL +normalize: false +normalize_reward: false +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true diff --git a/configs/finetune/cql/relocate/cloned_v1.yaml b/configs/finetune/cql/relocate/cloned_v1.yaml new file mode 100644 index 00000000..9fbb93de --- /dev/null +++ b/configs/finetune/cql/relocate/cloned_v1.yaml @@ -0,0 +1,39 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: true +cql_alpha: 1.0 +cql_alpha_online: 1.0 
+cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: relocate-cloned-v1 +eval_freq: 5000 +group: cql-relocate-cloned-v1-multiseed-v0 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 10 +name: CQL +normalize: false +normalize_reward: false +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true diff --git a/configs/finetune/iql/antmaze/large_diverse_v2.yaml b/configs/finetune/iql/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..11250ee2 --- /dev/null +++ b/configs/finetune/iql/antmaze/large_diverse_v2.yaml @@ -0,0 +1,24 @@ +actor_lr: 3e-4 +batch_size: 256 +beta: 10.0 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-large-diverse-v2 +eval_freq: 50000 +group: IQL-D4RL +iql_deterministic: false +iql_tau: 0.9 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 100 +name: IQL_antmaze-large-diverse-v2 +normalize: true +normalize_reward: true +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/finetune/iql/antmaze/large_play_v2.yaml b/configs/finetune/iql/antmaze/large_play_v2.yaml new file mode 100644 index 00000000..97643a61 --- /dev/null +++ b/configs/finetune/iql/antmaze/large_play_v2.yaml @@ -0,0 +1,24 @@ +actor_lr: 3e-4 +batch_size: 256 +beta: 10.0 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-large-play-v2 +eval_freq: 50000 +group: IQL-D4RL +iql_deterministic: false +iql_tau: 0.9 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 100 +name: IQL_antmaze-large-play-v2 +normalize: true +normalize_reward: true +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/finetune/iql/antmaze/medium_diverse_v2.yaml b/configs/finetune/iql/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..b92202f9 --- /dev/null +++ b/configs/finetune/iql/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,24 @@ +actor_lr: 3e-4 +batch_size: 256 +beta: 10.0 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-medium-diverse-v2 +eval_freq: 50000 +group: IQL-D4RL +iql_deterministic: false +iql_tau: 0.9 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 100 +name: IQL_antmaze-medium-diverse-v2 +normalize: true +normalize_reward: true +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/finetune/iql/antmaze/medium_play_v2.yaml b/configs/finetune/iql/antmaze/medium_play_v2.yaml new file mode 100644 index 00000000..cd1db48c --- /dev/null +++ b/configs/finetune/iql/antmaze/medium_play_v2.yaml @@ -0,0 +1,24 @@ +actor_lr: 3e-4 +batch_size: 256 +beta: 10.0 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-medium-play-v2 +eval_freq: 50000 +group: IQL-D4RL +iql_deterministic: false +iql_tau: 0.9 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 100 +name: IQL_antmaze-medium-play-v2 +normalize: true +normalize_reward: true +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/finetune/iql/antmaze/umaze_diverse_v2.yaml b/configs/finetune/iql/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..440fe453 --- /dev/null +++ 
b/configs/finetune/iql/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,24 @@ +actor_lr: 3e-4 +batch_size: 256 +beta: 10.0 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-umaze-diverse-v2 +eval_freq: 50000 +group: IQL-D4RL +iql_deterministic: false +iql_tau: 0.9 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 100 +name: IQL_antmaze-umaze-diverse-v2 +normalize: true +normalize_reward: true +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/finetune/iql/antmaze/umaze_v2.yaml b/configs/finetune/iql/antmaze/umaze_v2.yaml new file mode 100644 index 00000000..aea0060c --- /dev/null +++ b/configs/finetune/iql/antmaze/umaze_v2.yaml @@ -0,0 +1,24 @@ +actor_lr: 3e-4 +batch_size: 256 +beta: 10.0 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-umaze-v2 +eval_freq: 50000 +group: IQL-D4RL +iql_deterministic: false +iql_tau: 0.9 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 100 +name: IQL_antmaze-umaze-v2 +normalize: true +normalize_reward: true +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/finetune/iql/door/cloned_v1.yaml b/configs/finetune/iql/door/cloned_v1.yaml new file mode 100644 index 00000000..6487c192 --- /dev/null +++ b/configs/finetune/iql/door/cloned_v1.yaml @@ -0,0 +1,25 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: door-cloned-v1 +eval_freq: 5000 +group: IQL-D4RL +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 10 +name: IQL_door-cloned-v1 +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/finetune/iql/hammer/cloned_v1.yaml b/configs/finetune/iql/hammer/cloned_v1.yaml new file mode 100644 index 00000000..2ce00651 --- /dev/null +++ b/configs/finetune/iql/hammer/cloned_v1.yaml @@ -0,0 +1,25 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: hammer-cloned-v1 +eval_freq: 5000 +group: IQL-D4RL +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 10 +name: IQL_hammer-cloned-v1 +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/finetune/iql/pen/cloned_v1.yaml b/configs/finetune/iql/pen/cloned_v1.yaml new file mode 100644 index 00000000..30374a3c --- /dev/null +++ b/configs/finetune/iql/pen/cloned_v1.yaml @@ -0,0 +1,25 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: pen-cloned-v1 +eval_freq: 5000 +group: IQL-D4RL +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 10 +name: IQL_pen-cloned-v1 +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/finetune/iql/relocate/cloned_v1.yaml b/configs/finetune/iql/relocate/cloned_v1.yaml new file mode 100644 index 00000000..83212d52 --- /dev/null +++ b/configs/finetune/iql/relocate/cloned_v1.yaml @@ -0,0 +1,25 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 
+checkpoints_path: null +device: cuda +discount: 0.99 +env: relocate-cloned-v1 +eval_freq: 5000 +group: IQL-D4RL +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +offline_iterations: 1000000 +online_iterations: 1000000 +n_episodes: 10 +name: IQL_relocate-cloned-v1 +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/finetune/spot/antmaze/large_diverse_v2.yaml b/configs/finetune/spot/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..d7c7fda0 --- /dev/null +++ b/configs/finetune/spot/antmaze/large_diverse_v2.yaml @@ -0,0 +1,38 @@ +actor_init_w: 0.001 +actor_lr: 0.0001 +batch_size: 256 +beta: 0.5 +buffer_size: 2000000 +checkpoints_path: null +critic_init_w: 0.003 +critic_lr: 0.0003 +device: cuda +discount: 0.99 +env: antmaze-large-diverse-v2 +eval_freq: 50000 +eval_seed: 0 +expl_noise: 0.1 +group: SPOT-D4RL +iwae: false +lambd: 0.025 +lambd_cool: true +lambd_end: 0.2 +load_model: '' +n_episodes: 100 +name: SPOT +noise_clip: 0.5 +normalize: false +normalize_reward: true +num_samples: 1 +offline_iterations: 1000000 +online_discount: 0.995 +online_iterations: 1000000 +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 +vae_hidden_dim: 750 +vae_iterations: 100000 +vae_latent_dim: null +vae_lr: 0.001 diff --git a/configs/finetune/spot/antmaze/large_play_v2.yaml b/configs/finetune/spot/antmaze/large_play_v2.yaml new file mode 100644 index 00000000..2f9134c3 --- /dev/null +++ b/configs/finetune/spot/antmaze/large_play_v2.yaml @@ -0,0 +1,38 @@ +actor_init_w: 0.001 +actor_lr: 0.0001 +batch_size: 256 +beta: 0.5 +buffer_size: 2000000 +checkpoints_path: null +critic_init_w: 0.003 +critic_lr: 0.0003 +device: cuda +discount: 0.99 +env: antmaze-large-play-v2 +eval_freq: 50000 +eval_seed: 0 +expl_noise: 0.1 +group: SPOT-D4RL +iwae: false +lambd: 0.025 +lambd_cool: true +lambd_end: 0.2 +load_model: '' +n_episodes: 100 +name: SPOT +noise_clip: 0.5 +normalize: false +normalize_reward: true +num_samples: 1 +offline_iterations: 1000000 +online_discount: 0.995 +online_iterations: 1000000 +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 +vae_hidden_dim: 750 +vae_iterations: 100000 +vae_latent_dim: null +vae_lr: 0.001 diff --git a/configs/finetune/spot/antmaze/medium_diverse_v2.yaml b/configs/finetune/spot/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..8befd6f6 --- /dev/null +++ b/configs/finetune/spot/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,38 @@ +actor_init_w: 0.001 +actor_lr: 0.0001 +batch_size: 256 +beta: 0.5 +buffer_size: 2000000 +checkpoints_path: null +critic_init_w: 0.003 +critic_lr: 0.0003 +device: cuda +discount: 0.99 +env: antmaze-medium-diverse-v2 +eval_freq: 50000 +eval_seed: 0 +expl_noise: 0.1 +group: SPOT-D4RL +iwae: false +lambd: 0.025 +lambd_cool: true +lambd_end: 0.2 +load_model: '' +n_episodes: 100 +name: SPOT +noise_clip: 0.5 +normalize: false +normalize_reward: true +num_samples: 1 +offline_iterations: 1000000 +online_discount: 0.995 +online_iterations: 1000000 +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 +vae_hidden_dim: 750 +vae_iterations: 100000 +vae_latent_dim: null +vae_lr: 0.001 diff --git a/configs/finetune/spot/antmaze/medium_play_v2.yaml b/configs/finetune/spot/antmaze/medium_play_v2.yaml new file mode 100644 index 00000000..59c42661 --- /dev/null +++ b/configs/finetune/spot/antmaze/medium_play_v2.yaml @@ -0,0 +1,38 @@ +actor_init_w: 0.001 +actor_lr: 0.0001 +batch_size: 256 +beta: 0.5 +buffer_size: 
2000000 +checkpoints_path: null +critic_init_w: 0.003 +critic_lr: 0.0003 +device: cuda +discount: 0.99 +env: antmaze-medium-play-v2 +eval_freq: 50000 +eval_seed: 0 +expl_noise: 0.1 +group: SPOT-D4RL +iwae: false +lambd: 0.05 +lambd_cool: true +lambd_end: 0.2 +load_model: '' +n_episodes: 100 +name: SPOT +noise_clip: 0.5 +normalize: false +normalize_reward: true +num_samples: 1 +offline_iterations: 1000000 +online_discount: 0.995 +online_iterations: 1000000 +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 +vae_hidden_dim: 750 +vae_iterations: 100000 +vae_latent_dim: null +vae_lr: 0.001 diff --git a/configs/finetune/spot/antmaze/umaze_diverse_v2.yaml b/configs/finetune/spot/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..7c4cf56b --- /dev/null +++ b/configs/finetune/spot/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,38 @@ +actor_init_w: 0.001 +actor_lr: 0.0001 +batch_size: 256 +beta: 0.5 +buffer_size: 2000000 +checkpoints_path: null +critic_init_w: 0.003 +critic_lr: 0.0003 +device: cuda +discount: 0.99 +env: antmaze-umaze-diverse-v2 +eval_freq: 50000 +eval_seed: 0 +expl_noise: 0.1 +group: SPOT-D4RL +iwae: false +lambd: 0.25 +lambd_cool: true +lambd_end: 0.2 +load_model: '' +n_episodes: 100 +name: SPOT +noise_clip: 0.5 +normalize: false +normalize_reward: true +num_samples: 1 +offline_iterations: 1000000 +online_discount: 0.995 +online_iterations: 1000000 +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 +vae_hidden_dim: 750 +vae_iterations: 100000 +vae_latent_dim: null +vae_lr: 0.001 diff --git a/configs/finetune/spot/antmaze/umaze_v2.yaml b/configs/finetune/spot/antmaze/umaze_v2.yaml new file mode 100644 index 00000000..1a2672ba --- /dev/null +++ b/configs/finetune/spot/antmaze/umaze_v2.yaml @@ -0,0 +1,38 @@ +actor_init_w: 0.001 +actor_lr: 0.0001 +batch_size: 256 +beta: 0.5 +buffer_size: 2000000 +checkpoints_path: null +critic_init_w: 0.003 +critic_lr: 0.0003 +device: cuda +discount: 0.99 +env: antmaze-umaze-v2 +eval_freq: 50000 +eval_seed: 0 +expl_noise: 0.1 +group: SPOT-D4RL +iwae: false +lambd: 0.25 +lambd_cool: true +lambd_end: 0.2 +load_model: '' +n_episodes: 100 +name: SPOT +noise_clip: 0.5 +normalize: false +normalize_reward: true +num_samples: 1 +offline_iterations: 1000000 +online_discount: 0.995 +online_iterations: 1000000 +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 +vae_hidden_dim: 750 +vae_iterations: 100000 +vae_latent_dim: null +vae_lr: 0.001 diff --git a/configs/finetune/spot/door/cloned_v1.yaml b/configs/finetune/spot/door/cloned_v1.yaml new file mode 100644 index 00000000..ee31cefb --- /dev/null +++ b/configs/finetune/spot/door/cloned_v1.yaml @@ -0,0 +1,38 @@ +actor_init_w: 0.001 +actor_lr: 0.0001 +batch_size: 256 +beta: 0.5 +buffer_size: 2000000 +checkpoints_path: null +critic_init_w: 0.003 +critic_lr: 0.0003 +device: cuda +discount: 0.99 +env: door-cloned-v1 +eval_freq: 5000 +eval_seed: 0 +expl_noise: 0.1 +group: SPOT-D4RL +iwae: false +lambd: 1.0 +lambd_cool: true +lambd_end: 0.5 +load_model: '' +n_episodes: 10 +name: SPOT +noise_clip: 0.5 +normalize: false +normalize_reward: false +num_samples: 1 +offline_iterations: 1000000 +online_discount: 0.99 +online_iterations: 1000000 +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 +vae_hidden_dim: 750 +vae_iterations: 100000 +vae_latent_dim: null +vae_lr: 0.001 diff --git a/configs/finetune/spot/hammer/cloned_v1.yaml b/configs/finetune/spot/hammer/cloned_v1.yaml new file mode 100644 index 00000000..46baa763 --- /dev/null +++ 
b/configs/finetune/spot/hammer/cloned_v1.yaml @@ -0,0 +1,38 @@ +actor_init_w: 0.001 +actor_lr: 0.0001 +batch_size: 256 +beta: 0.5 +buffer_size: 2000000 +checkpoints_path: null +critic_init_w: 0.003 +critic_lr: 0.0003 +device: cuda +discount: 0.99 +env: hammer-cloned-v1 +eval_freq: 5000 +eval_seed: 0 +expl_noise: 0.1 +group: SPOT-D4RL +iwae: false +lambd: 1.0 +lambd_cool: true +lambd_end: 0.5 +load_model: '' +n_episodes: 10 +name: SPOT +noise_clip: 0.5 +normalize: false +normalize_reward: false +num_samples: 1 +offline_iterations: 1000000 +online_discount: 0.99 +online_iterations: 1000000 +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 +vae_hidden_dim: 750 +vae_iterations: 100000 +vae_latent_dim: null +vae_lr: 0.001 diff --git a/configs/finetune/spot/pen/cloned_v1.yaml b/configs/finetune/spot/pen/cloned_v1.yaml new file mode 100644 index 00000000..177b073e --- /dev/null +++ b/configs/finetune/spot/pen/cloned_v1.yaml @@ -0,0 +1,38 @@ +actor_init_w: 0.001 +actor_lr: 0.0001 +batch_size: 256 +beta: 0.5 +buffer_size: 2000000 +checkpoints_path: null +critic_init_w: 0.003 +critic_lr: 0.0003 +device: cuda +discount: 0.99 +env: pen-cloned-v1 +eval_freq: 5000 +eval_seed: 0 +expl_noise: 0.1 +group: SPOT-D4RL +iwae: false +lambd: 1.0 +lambd_cool: true +lambd_end: 0.5 +load_model: '' +n_episodes: 10 +name: SPOT +noise_clip: 0.5 +normalize: false +normalize_reward: false +num_samples: 1 +offline_iterations: 1000000 +online_discount: 0.99 +online_iterations: 1000000 +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 +vae_hidden_dim: 750 +vae_iterations: 100000 +vae_latent_dim: null +vae_lr: 0.001 diff --git a/configs/finetune/spot/relocate/cloned_v1.yaml b/configs/finetune/spot/relocate/cloned_v1.yaml new file mode 100644 index 00000000..98f4890e --- /dev/null +++ b/configs/finetune/spot/relocate/cloned_v1.yaml @@ -0,0 +1,38 @@ +actor_init_w: 0.001 +actor_lr: 0.0001 +batch_size: 256 +beta: 0.5 +buffer_size: 2000000 +checkpoints_path: null +critic_init_w: 0.003 +critic_lr: 0.0003 +device: cuda +discount: 0.99 +env: relocate-cloned-v1 +eval_freq: 5000 +eval_seed: 0 +expl_noise: 0.1 +group: SPOT-D4RL +iwae: false +lambd: 1.0 +lambd_cool: true +lambd_end: 0.5 +load_model: '' +n_episodes: 10 +name: SPOT +noise_clip: 0.5 +normalize: false +normalize_reward: false +num_samples: 1 +offline_iterations: 1000000 +online_discount: 0.99 +online_iterations: 1000000 +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 +vae_hidden_dim: 750 +vae_iterations: 100000 +vae_latent_dim: null +vae_lr: 0.001 diff --git a/configs/offline/awac/antmaze/large_diverse_v2.yaml b/configs/offline/awac/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..6d9308f0 --- /dev/null +++ b/configs/offline/awac/antmaze/large_diverse_v2.yaml @@ -0,0 +1,19 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: antmaze-large-diverse-v2 +eval_frequency: 1000 +gamma: 0.99 +group: awac-antmaze-large-diverse-v2-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 100 +normalize_reward: true +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/antmaze/large_play_v0.yaml b/configs/offline/awac/antmaze/large_play_v2.yaml similarity index 72% rename from configs/offline/awac/antmaze/large_play_v0.yaml rename to configs/offline/awac/antmaze/large_play_v2.yaml index aa9546e4..cbe95e54 100644 --- 
a/configs/offline/awac/antmaze/large_play_v0.yaml +++ b/configs/offline/awac/antmaze/large_play_v2.yaml @@ -2,12 +2,12 @@ awac_lambda: 0.1 batch_size: 256 buffer_size: 10000000 checkpoints_path: null -deterministic_torch: true +deterministic_torch: false device: cuda -env_name: antmaze-large-play-v0 +env_name: antmaze-large-play-v2 eval_frequency: 1000 gamma: 0.99 -group: awac-antmaze-large-play-v0-multiseed-v0 +group: awac-antmaze-large-play-v2-multiseed-v0 hidden_dim: 256 learning_rate: 0.0003 n_test_episodes: 100 diff --git a/configs/offline/awac/antmaze/medium_diverse_v2.yaml b/configs/offline/awac/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..3abde369 --- /dev/null +++ b/configs/offline/awac/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,19 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: antmaze-medium-diverse-v2 +eval_frequency: 1000 +gamma: 0.99 +group: awac-antmaze-medium-diverse-v2-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 100 +normalize_reward: true +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/antmaze/medium_play_v0.yaml b/configs/offline/awac/antmaze/medium_play_v2.yaml similarity index 71% rename from configs/offline/awac/antmaze/medium_play_v0.yaml rename to configs/offline/awac/antmaze/medium_play_v2.yaml index 5868a658..a31e05bc 100644 --- a/configs/offline/awac/antmaze/medium_play_v0.yaml +++ b/configs/offline/awac/antmaze/medium_play_v2.yaml @@ -2,12 +2,12 @@ awac_lambda: 0.1 batch_size: 256 buffer_size: 10000000 checkpoints_path: null -deterministic_torch: true +deterministic_torch: false device: cuda -env_name: antmaze-medium-play-v0 +env_name: antmaze-medium-play-v2 eval_frequency: 1000 gamma: 0.99 -group: awac-antmaze-medium-play-v0-multiseed-v0 +group: awac-antmaze-medium-play-v2-multiseed-v0 hidden_dim: 256 learning_rate: 0.0003 n_test_episodes: 100 diff --git a/configs/offline/awac/antmaze/umaze_diverse_v2.yaml b/configs/offline/awac/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..35f6fdd6 --- /dev/null +++ b/configs/offline/awac/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,19 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: antmaze-umaze-diverse-v2 +eval_frequency: 1000 +gamma: 0.99 +group: awac-antmaze-umaze-diverse-v2-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 100 +normalize_reward: true +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/antmaze/umaze_v0.yaml b/configs/offline/awac/antmaze/umaze_v2.yaml similarity index 74% rename from configs/offline/awac/antmaze/umaze_v0.yaml rename to configs/offline/awac/antmaze/umaze_v2.yaml index b164b0b9..7a2f1a83 100644 --- a/configs/offline/awac/antmaze/umaze_v0.yaml +++ b/configs/offline/awac/antmaze/umaze_v2.yaml @@ -2,12 +2,12 @@ awac_lambda: 0.1 batch_size: 256 buffer_size: 10000000 checkpoints_path: null -deterministic_torch: true +deterministic_torch: false device: cuda -env_name: antmaze-umaze-v0 +env_name: antmaze-umaze-v2 eval_frequency: 1000 gamma: 0.99 -group: awac-antmaze-umaze-v0-multiseed-v0 +group: awac-antmaze-umaze-v2-multiseed-v0 hidden_dim: 256 learning_rate: 0.0003 n_test_episodes: 100 diff --git a/configs/offline/awac/door/cloned_v1.yaml b/configs/offline/awac/door/cloned_v1.yaml new file mode 100644 index 
00000000..5d58fd59 --- /dev/null +++ b/configs/offline/awac/door/cloned_v1.yaml @@ -0,0 +1,18 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: door-cloned-v1 +eval_frequency: 1000 +gamma: 0.99 +group: awac-door-cloned-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/door/expert_v1.yaml b/configs/offline/awac/door/expert_v1.yaml new file mode 100644 index 00000000..5e5dc0ca --- /dev/null +++ b/configs/offline/awac/door/expert_v1.yaml @@ -0,0 +1,18 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: door-expert-v1 +eval_frequency: 1000 +gamma: 0.99 +group: awac-door-expert-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/door/human_v1.yaml b/configs/offline/awac/door/human_v1.yaml new file mode 100644 index 00000000..ee9876c8 --- /dev/null +++ b/configs/offline/awac/door/human_v1.yaml @@ -0,0 +1,18 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: door-human-v1 +eval_frequency: 1000 +gamma: 0.99 +group: awac-door-human-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/hammer/cloned_v1.yaml b/configs/offline/awac/hammer/cloned_v1.yaml new file mode 100644 index 00000000..2b457546 --- /dev/null +++ b/configs/offline/awac/hammer/cloned_v1.yaml @@ -0,0 +1,18 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: hammer-cloned-v1 +eval_frequency: 1000 +gamma: 0.99 +group: awac-hammer-cloned-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/hammer/expert_v1.yaml b/configs/offline/awac/hammer/expert_v1.yaml new file mode 100644 index 00000000..67dea1d9 --- /dev/null +++ b/configs/offline/awac/hammer/expert_v1.yaml @@ -0,0 +1,18 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: hammer-expert-v1 +eval_frequency: 1000 +gamma: 0.99 +group: awac-hammer-expert-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/hammer/human_v1.yaml b/configs/offline/awac/hammer/human_v1.yaml new file mode 100644 index 00000000..d85de982 --- /dev/null +++ b/configs/offline/awac/hammer/human_v1.yaml @@ -0,0 +1,18 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: hammer-human-v1 +eval_frequency: 1000 +gamma: 0.99 +group: awac-hammer-human-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/pen/cloned_v1.yaml b/configs/offline/awac/pen/cloned_v1.yaml new file mode 100644 index 00000000..0e85799d --- /dev/null 
+++ b/configs/offline/awac/pen/cloned_v1.yaml @@ -0,0 +1,18 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: pen-cloned-v1 +eval_frequency: 1000 +gamma: 0.99 +group: awac-pen-cloned-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/pen/expert_v1.yaml b/configs/offline/awac/pen/expert_v1.yaml new file mode 100644 index 00000000..535586e4 --- /dev/null +++ b/configs/offline/awac/pen/expert_v1.yaml @@ -0,0 +1,18 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: pen-expert-v1 +eval_frequency: 1000 +gamma: 0.99 +group: awac-pen-expert-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/pen/human_v1.yaml b/configs/offline/awac/pen/human_v1.yaml new file mode 100644 index 00000000..27d84d32 --- /dev/null +++ b/configs/offline/awac/pen/human_v1.yaml @@ -0,0 +1,18 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: pen-human-v1 +eval_frequency: 1000 +gamma: 0.99 +group: awac-pen-human-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/relocate/cloned_v1.yaml b/configs/offline/awac/relocate/cloned_v1.yaml new file mode 100644 index 00000000..632a1d96 --- /dev/null +++ b/configs/offline/awac/relocate/cloned_v1.yaml @@ -0,0 +1,18 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: relocate-cloned-v1 +eval_frequency: 1000 +gamma: 0.99 +group: awac-relocate-cloned-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/relocate/expert_v1.yaml b/configs/offline/awac/relocate/expert_v1.yaml new file mode 100644 index 00000000..100fe20f --- /dev/null +++ b/configs/offline/awac/relocate/expert_v1.yaml @@ -0,0 +1,18 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: relocate-expert-v1 +eval_frequency: 1000 +gamma: 0.99 +group: awac-relocate-expert-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/awac/relocate/human_v1.yaml b/configs/offline/awac/relocate/human_v1.yaml new file mode 100644 index 00000000..501cd318 --- /dev/null +++ b/configs/offline/awac/relocate/human_v1.yaml @@ -0,0 +1,18 @@ +awac_lambda: 0.1 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +deterministic_torch: false +device: cuda +env_name: relocate-human-v1 +eval_frequency: 1000 +gamma: 0.99 +group: awac-relocate-human-v1-multiseed-v0 +hidden_dim: 256 +learning_rate: 0.0003 +n_test_episodes: 10 +num_train_ops: 1000000 +project: CORL +seed: 42 +tau: 0.005 +test_seed: 69 diff --git a/configs/offline/bc/antmaze/large_diverse_v2.yaml b/configs/offline/bc/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..c719679f --- 
/dev/null +++ b/configs/offline/bc/antmaze/large_diverse_v2.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-large-diverse-v2 +eval_freq: 5000 +frac: 1.0 +group: bc-antmaze-large-diverse-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 100 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/antmaze/large_play_v0.yaml b/configs/offline/bc/antmaze/large_play_v2.yaml similarity index 76% rename from configs/offline/bc/antmaze/large_play_v0.yaml rename to configs/offline/bc/antmaze/large_play_v2.yaml index f9f93623..60cb8988 100644 --- a/configs/offline/bc/antmaze/large_play_v0.yaml +++ b/configs/offline/bc/antmaze/large_play_v2.yaml @@ -3,10 +3,10 @@ buffer_size: 10000000 checkpoints_path: null device: cuda discount: 0.99 -env: antmaze-large-play-v0 +env: antmaze-large-play-v2 eval_freq: 5000 frac: 1.0 -group: bc-antmaze-large-play-v0-multiseed-v0 +group: bc-antmaze-large-play-v2-multiseed-v0 load_model: '' max_timesteps: 1000000 max_traj_len: 1000 diff --git a/configs/offline/bc/antmaze/medium_diverse_v2.yaml b/configs/offline/bc/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..10524acc --- /dev/null +++ b/configs/offline/bc/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-medium-diverse-v2 +eval_freq: 5000 +frac: 1.0 +group: bc-antmaze-medium-diverse-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 100 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/antmaze/medium_play_v0.yaml b/configs/offline/bc/antmaze/medium_play_v2.yaml similarity index 76% rename from configs/offline/bc/antmaze/medium_play_v0.yaml rename to configs/offline/bc/antmaze/medium_play_v2.yaml index 25f38bc6..cb713c08 100644 --- a/configs/offline/bc/antmaze/medium_play_v0.yaml +++ b/configs/offline/bc/antmaze/medium_play_v2.yaml @@ -3,10 +3,10 @@ buffer_size: 10000000 checkpoints_path: null device: cuda discount: 0.99 -env: antmaze-medium-play-v0 +env: antmaze-medium-play-v2 eval_freq: 5000 frac: 1.0 -group: bc-antmaze-medium-play-v0-multiseed-v0 +group: bc-antmaze-medium-play-v2-multiseed-v0 load_model: '' max_timesteps: 1000000 max_traj_len: 1000 diff --git a/configs/offline/bc/antmaze/umaze_diverse_v2.yaml b/configs/offline/bc/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..51874e77 --- /dev/null +++ b/configs/offline/bc/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-umaze-diverse-v2 +eval_freq: 5000 +frac: 1.0 +group: bc-antmaze-umaze-diverse-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 100 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/antmaze/umaze_v0.yaml b/configs/offline/bc/antmaze/umaze_v2.yaml similarity index 79% rename from configs/offline/bc/antmaze/umaze_v0.yaml rename to configs/offline/bc/antmaze/umaze_v2.yaml index 49f9c3e8..10fb284e 100644 --- a/configs/offline/bc/antmaze/umaze_v0.yaml +++ b/configs/offline/bc/antmaze/umaze_v2.yaml @@ -3,10 +3,10 @@ buffer_size: 10000000 checkpoints_path: null device: cuda discount: 0.99 -env: antmaze-umaze-v0 +env: antmaze-umaze-v2 eval_freq: 5000 frac: 1.0 -group: bc-antmaze-umaze-v0-multiseed-v0 +group: 
bc-antmaze-umaze-v2-multiseed-v0 load_model: '' max_timesteps: 1000000 max_traj_len: 1000 diff --git a/configs/offline/bc/door/cloned_v1.yaml b/configs/offline/bc/door/cloned_v1.yaml new file mode 100644 index 00000000..80afa0b5 --- /dev/null +++ b/configs/offline/bc/door/cloned_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: door-cloned-v1 +eval_freq: 5000 +frac: 1.0 +group: bc-door-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/door/expert_v1.yaml b/configs/offline/bc/door/expert_v1.yaml new file mode 100644 index 00000000..5b39c845 --- /dev/null +++ b/configs/offline/bc/door/expert_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: door-expert-v1 +eval_freq: 5000 +frac: 1.0 +group: bc-door-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/door/human_v1.yaml b/configs/offline/bc/door/human_v1.yaml new file mode 100644 index 00000000..3486d168 --- /dev/null +++ b/configs/offline/bc/door/human_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: door-human-v1 +eval_freq: 5000 +frac: 1.0 +group: bc-door-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/hammer/cloned_v1.yaml b/configs/offline/bc/hammer/cloned_v1.yaml new file mode 100644 index 00000000..e941833d --- /dev/null +++ b/configs/offline/bc/hammer/cloned_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: hammer-cloned-v1 +eval_freq: 5000 +frac: 1.0 +group: bc-hammer-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/hammer/expert_v1.yaml b/configs/offline/bc/hammer/expert_v1.yaml new file mode 100644 index 00000000..34b16ac3 --- /dev/null +++ b/configs/offline/bc/hammer/expert_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: hammer-expert-v1 +eval_freq: 5000 +frac: 1.0 +group: bc-hammer-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/hammer/human_v1.yaml b/configs/offline/bc/hammer/human_v1.yaml new file mode 100644 index 00000000..44781db4 --- /dev/null +++ b/configs/offline/bc/hammer/human_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: hammer-human-v1 +eval_freq: 5000 +frac: 1.0 +group: bc-hammer-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/pen/cloned_v1.yaml b/configs/offline/bc/pen/cloned_v1.yaml new file mode 100644 index 00000000..3999c70e --- /dev/null +++ b/configs/offline/bc/pen/cloned_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 
0.99 +env: pen-cloned-v1 +eval_freq: 5000 +frac: 1.0 +group: bc-pen-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/pen/expert_v1.yaml b/configs/offline/bc/pen/expert_v1.yaml new file mode 100644 index 00000000..7a1def38 --- /dev/null +++ b/configs/offline/bc/pen/expert_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: pen-expert-v1 +eval_freq: 5000 +frac: 1.0 +group: bc-pen-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/pen/human_v1.yaml b/configs/offline/bc/pen/human_v1.yaml new file mode 100644 index 00000000..d260eaff --- /dev/null +++ b/configs/offline/bc/pen/human_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: pen-human-v1 +eval_freq: 5000 +frac: 1.0 +group: bc-pen-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/relocate/cloned_v1.yaml b/configs/offline/bc/relocate/cloned_v1.yaml new file mode 100644 index 00000000..b82f934a --- /dev/null +++ b/configs/offline/bc/relocate/cloned_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: relocate-cloned-v1 +eval_freq: 5000 +frac: 1.0 +group: bc-relocate-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/relocate/expert_v1.yaml b/configs/offline/bc/relocate/expert_v1.yaml new file mode 100644 index 00000000..22a4e4df --- /dev/null +++ b/configs/offline/bc/relocate/expert_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: relocate-expert-v1 +eval_freq: 5000 +frac: 1.0 +group: bc-relocate-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc/relocate/human_v1.yaml b/configs/offline/bc/relocate/human_v1.yaml new file mode 100644 index 00000000..57229b95 --- /dev/null +++ b/configs/offline/bc/relocate/human_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: relocate-human-v1 +eval_freq: 5000 +frac: 1.0 +group: bc-relocate-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/antmaze/large_diverse_v2.yaml b/configs/offline/bc_10/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..92ff9537 --- /dev/null +++ b/configs/offline/bc_10/antmaze/large_diverse_v2.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: antmaze-large-diverse-v2 +eval_freq: 5000 +frac: 0.1 +group: bc-10-antmaze-large-diverse-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 100 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/antmaze/large_play_v0.yaml 
b/configs/offline/bc_10/antmaze/large_play_v2.yaml similarity index 71% rename from configs/offline/bc_10/antmaze/large_play_v0.yaml rename to configs/offline/bc_10/antmaze/large_play_v2.yaml index d9065dc0..64322f13 100644 --- a/configs/offline/bc_10/antmaze/large_play_v0.yaml +++ b/configs/offline/bc_10/antmaze/large_play_v2.yaml @@ -2,11 +2,11 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 -env: antmaze-large-play-v0 +discount: 1.0 +env: antmaze-large-play-v2 eval_freq: 5000 frac: 0.1 -group: bc-10-antmaze-large-play-v0-multiseed-v0 +group: bc-10-antmaze-large-play-v2-multiseed-v0 load_model: '' max_timesteps: 1000000 max_traj_len: 1000 diff --git a/configs/offline/bc_10/antmaze/medium_diverse_v2.yaml b/configs/offline/bc_10/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..12edfa67 --- /dev/null +++ b/configs/offline/bc_10/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: antmaze-medium-diverse-v2 +eval_freq: 5000 +frac: 0.1 +group: bc-10-antmaze-medium-diverse-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 100 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/antmaze/medium_play_v0.yaml b/configs/offline/bc_10/antmaze/medium_play_v2.yaml similarity index 70% rename from configs/offline/bc_10/antmaze/medium_play_v0.yaml rename to configs/offline/bc_10/antmaze/medium_play_v2.yaml index fbb04756..2ff014b7 100644 --- a/configs/offline/bc_10/antmaze/medium_play_v0.yaml +++ b/configs/offline/bc_10/antmaze/medium_play_v2.yaml @@ -2,11 +2,11 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 -env: antmaze-medium-play-v0 +discount: 1.0 +env: antmaze-medium-play-v2 eval_freq: 5000 frac: 0.1 -group: bc-10-antmaze-medium-play-v0-multiseed-v0 +group: bc-10-antmaze-medium-play-v2-multiseed-v0 load_model: '' max_timesteps: 1000000 max_traj_len: 1000 diff --git a/configs/offline/bc_10/antmaze/umaze_diverse_v2.yaml b/configs/offline/bc_10/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..6e48aed3 --- /dev/null +++ b/configs/offline/bc_10/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: antmaze-umaze-diverse-v2 +eval_freq: 5000 +frac: 0.1 +group: bc-10-antmaze-umaze-diverse-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 100 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/antmaze/umaze_v0.yaml b/configs/offline/bc_10/antmaze/umaze_v2.yaml similarity index 73% rename from configs/offline/bc_10/antmaze/umaze_v0.yaml rename to configs/offline/bc_10/antmaze/umaze_v2.yaml index a6d6d8ae..276645c7 100644 --- a/configs/offline/bc_10/antmaze/umaze_v0.yaml +++ b/configs/offline/bc_10/antmaze/umaze_v2.yaml @@ -2,11 +2,11 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 -env: antmaze-umaze-v0 +discount: 1.0 +env: antmaze-umaze-v2 eval_freq: 5000 frac: 0.1 -group: bc-10-antmaze-umaze-v0-multiseed-v0 +group: bc-10-antmaze-umaze-v2-multiseed-v0 load_model: '' max_timesteps: 1000000 max_traj_len: 1000 diff --git a/configs/offline/bc_10/door/cloned_v1.yaml b/configs/offline/bc_10/door/cloned_v1.yaml new file mode 100644 index 00000000..a5b91cfe --- /dev/null +++ 
b/configs/offline/bc_10/door/cloned_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: door-cloned-v1 +eval_freq: 5000 +frac: 0.1 +group: bc-10-door-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/door/expert_v1.yaml b/configs/offline/bc_10/door/expert_v1.yaml new file mode 100644 index 00000000..adfb0a53 --- /dev/null +++ b/configs/offline/bc_10/door/expert_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: door-expert-v1 +eval_freq: 5000 +frac: 0.1 +group: bc-10-door-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/door/human_v1.yaml b/configs/offline/bc_10/door/human_v1.yaml new file mode 100644 index 00000000..1b1f54d9 --- /dev/null +++ b/configs/offline/bc_10/door/human_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: door-human-v1 +eval_freq: 5000 +frac: 0.1 +group: bc-10-door-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/halfcheetah/expert_v2.yaml b/configs/offline/bc_10/halfcheetah/expert_v2.yaml index f01b1fd5..fdf03395 100644 --- a/configs/offline/bc_10/halfcheetah/expert_v2.yaml +++ b/configs/offline/bc_10/halfcheetah/expert_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: halfcheetah-expert-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/halfcheetah/full_replay_v2.yaml b/configs/offline/bc_10/halfcheetah/full_replay_v2.yaml index 95016d8a..4116d83c 100644 --- a/configs/offline/bc_10/halfcheetah/full_replay_v2.yaml +++ b/configs/offline/bc_10/halfcheetah/full_replay_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: halfcheetah-full-replay-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/halfcheetah/medium_expert_v2.yaml b/configs/offline/bc_10/halfcheetah/medium_expert_v2.yaml index 294c1526..17d2960a 100644 --- a/configs/offline/bc_10/halfcheetah/medium_expert_v2.yaml +++ b/configs/offline/bc_10/halfcheetah/medium_expert_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: halfcheetah-medium-expert-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/halfcheetah/medium_replay_v2.yaml b/configs/offline/bc_10/halfcheetah/medium_replay_v2.yaml index e0109c34..af2d1cd6 100644 --- a/configs/offline/bc_10/halfcheetah/medium_replay_v2.yaml +++ b/configs/offline/bc_10/halfcheetah/medium_replay_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: halfcheetah-medium-replay-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/halfcheetah/medium_v2.yaml b/configs/offline/bc_10/halfcheetah/medium_v2.yaml index 005580ee..70280a2c 100644 --- a/configs/offline/bc_10/halfcheetah/medium_v2.yaml +++ b/configs/offline/bc_10/halfcheetah/medium_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 
buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: halfcheetah-medium-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/halfcheetah/random_v2.yaml b/configs/offline/bc_10/halfcheetah/random_v2.yaml index f0bf1bfb..bb3b122e 100644 --- a/configs/offline/bc_10/halfcheetah/random_v2.yaml +++ b/configs/offline/bc_10/halfcheetah/random_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: halfcheetah-random-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/hammer/cloned_v1.yaml b/configs/offline/bc_10/hammer/cloned_v1.yaml new file mode 100644 index 00000000..6431d01e --- /dev/null +++ b/configs/offline/bc_10/hammer/cloned_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: hammer-cloned-v1 +eval_freq: 5000 +frac: 0.1 +group: bc-10-hammer-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/hammer/expert_v1.yaml b/configs/offline/bc_10/hammer/expert_v1.yaml new file mode 100644 index 00000000..9dbff8b5 --- /dev/null +++ b/configs/offline/bc_10/hammer/expert_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: hammer-expert-v1 +eval_freq: 5000 +frac: 0.1 +group: bc-10-hammer-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/hammer/human_v1.yaml b/configs/offline/bc_10/hammer/human_v1.yaml new file mode 100644 index 00000000..e6a47d36 --- /dev/null +++ b/configs/offline/bc_10/hammer/human_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: hammer-human-v1 +eval_freq: 5000 +frac: 0.1 +group: bc-10-hammer-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/hopper/expert_v2.yaml b/configs/offline/bc_10/hopper/expert_v2.yaml index 7021622d..dea6ab6d 100644 --- a/configs/offline/bc_10/hopper/expert_v2.yaml +++ b/configs/offline/bc_10/hopper/expert_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: hopper-expert-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/hopper/full_replay_v2.yaml b/configs/offline/bc_10/hopper/full_replay_v2.yaml index f5c855d1..dc25e115 100644 --- a/configs/offline/bc_10/hopper/full_replay_v2.yaml +++ b/configs/offline/bc_10/hopper/full_replay_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: hopper-full-replay-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/hopper/medium_expert_v2.yaml b/configs/offline/bc_10/hopper/medium_expert_v2.yaml index 1215d293..f60efe6d 100644 --- a/configs/offline/bc_10/hopper/medium_expert_v2.yaml +++ b/configs/offline/bc_10/hopper/medium_expert_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: hopper-medium-expert-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/hopper/medium_replay_v2.yaml 
b/configs/offline/bc_10/hopper/medium_replay_v2.yaml index c5f98dd3..67aec83d 100644 --- a/configs/offline/bc_10/hopper/medium_replay_v2.yaml +++ b/configs/offline/bc_10/hopper/medium_replay_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: hopper-medium-replay-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/hopper/medium_v2.yaml b/configs/offline/bc_10/hopper/medium_v2.yaml index 724ea8d0..2dd5ce89 100644 --- a/configs/offline/bc_10/hopper/medium_v2.yaml +++ b/configs/offline/bc_10/hopper/medium_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: hopper-medium-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/hopper/random_v2.yaml b/configs/offline/bc_10/hopper/random_v2.yaml index 639a9a8f..6f1f7d19 100644 --- a/configs/offline/bc_10/hopper/random_v2.yaml +++ b/configs/offline/bc_10/hopper/random_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: hopper-random-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/maze2d/large_dense_v1.yaml b/configs/offline/bc_10/maze2d/large_dense_v1.yaml index 1dba0c7f..0f059da3 100644 --- a/configs/offline/bc_10/maze2d/large_dense_v1.yaml +++ b/configs/offline/bc_10/maze2d/large_dense_v1.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: maze2d-large-dense-v1 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/maze2d/large_v1.yaml b/configs/offline/bc_10/maze2d/large_v1.yaml index 4fda491a..9ef4934f 100644 --- a/configs/offline/bc_10/maze2d/large_v1.yaml +++ b/configs/offline/bc_10/maze2d/large_v1.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: maze2d-large-v1 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/maze2d/medium_dense_v1.yaml b/configs/offline/bc_10/maze2d/medium_dense_v1.yaml index 97683644..54080b6b 100644 --- a/configs/offline/bc_10/maze2d/medium_dense_v1.yaml +++ b/configs/offline/bc_10/maze2d/medium_dense_v1.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: maze2d-medium-dense-v1 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/maze2d/medium_v1.yaml b/configs/offline/bc_10/maze2d/medium_v1.yaml index 1e8546cf..802541ef 100644 --- a/configs/offline/bc_10/maze2d/medium_v1.yaml +++ b/configs/offline/bc_10/maze2d/medium_v1.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: maze2d-medium-v1 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/maze2d/umaze_dense_v1.yaml b/configs/offline/bc_10/maze2d/umaze_dense_v1.yaml index ac09cc75..ff10759b 100644 --- a/configs/offline/bc_10/maze2d/umaze_dense_v1.yaml +++ b/configs/offline/bc_10/maze2d/umaze_dense_v1.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: maze2d-umaze-dense-v1 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/maze2d/umaze_v1.yaml b/configs/offline/bc_10/maze2d/umaze_v1.yaml index 88e8b786..0de9ccc8 100644 --- a/configs/offline/bc_10/maze2d/umaze_v1.yaml +++ b/configs/offline/bc_10/maze2d/umaze_v1.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 
10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: maze2d-umaze-v1 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/pen/cloned_v1.yaml b/configs/offline/bc_10/pen/cloned_v1.yaml new file mode 100644 index 00000000..c7b613fc --- /dev/null +++ b/configs/offline/bc_10/pen/cloned_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: pen-cloned-v1 +eval_freq: 5000 +frac: 0.1 +group: bc-10-pen-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/pen/expert_v1.yaml b/configs/offline/bc_10/pen/expert_v1.yaml new file mode 100644 index 00000000..51bd3bc7 --- /dev/null +++ b/configs/offline/bc_10/pen/expert_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: pen-expert-v1 +eval_freq: 5000 +frac: 0.1 +group: bc-10-pen-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/pen/human_v1.yaml b/configs/offline/bc_10/pen/human_v1.yaml new file mode 100644 index 00000000..e1e49909 --- /dev/null +++ b/configs/offline/bc_10/pen/human_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: pen-human-v1 +eval_freq: 5000 +frac: 0.1 +group: bc-10-pen-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/relocate/cloned_v1.yaml b/configs/offline/bc_10/relocate/cloned_v1.yaml new file mode 100644 index 00000000..c080f6ce --- /dev/null +++ b/configs/offline/bc_10/relocate/cloned_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: relocate-cloned-v1 +eval_freq: 5000 +frac: 0.1 +group: bc-10-relocate-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/relocate/expert_v1.yaml b/configs/offline/bc_10/relocate/expert_v1.yaml new file mode 100644 index 00000000..d35264cf --- /dev/null +++ b/configs/offline/bc_10/relocate/expert_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: relocate-expert-v1 +eval_freq: 5000 +frac: 0.1 +group: bc-10-relocate-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/relocate/human_v1.yaml b/configs/offline/bc_10/relocate/human_v1.yaml new file mode 100644 index 00000000..868b1e41 --- /dev/null +++ b/configs/offline/bc_10/relocate/human_v1.yaml @@ -0,0 +1,17 @@ +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 1.0 +env: relocate-human-v1 +eval_freq: 5000 +frac: 0.1 +group: bc-10-relocate-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +max_traj_len: 1000 +n_episodes: 10 +name: BC-10 +normalize: true +project: CORL +seed: 0 diff --git a/configs/offline/bc_10/walker2d/expert_v2.yaml b/configs/offline/bc_10/walker2d/expert_v2.yaml index 2a3268c4..6bb794c8 100644 --- 
a/configs/offline/bc_10/walker2d/expert_v2.yaml +++ b/configs/offline/bc_10/walker2d/expert_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: walker2d-expert-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/walker2d/full_replay_v2.yaml b/configs/offline/bc_10/walker2d/full_replay_v2.yaml index b13a2c91..8060a521 100644 --- a/configs/offline/bc_10/walker2d/full_replay_v2.yaml +++ b/configs/offline/bc_10/walker2d/full_replay_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: walker2d-full-replay-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/walker2d/medium_expert_v2.yaml b/configs/offline/bc_10/walker2d/medium_expert_v2.yaml index e5f37612..3ebc9bf6 100644 --- a/configs/offline/bc_10/walker2d/medium_expert_v2.yaml +++ b/configs/offline/bc_10/walker2d/medium_expert_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: walker2d-medium-expert-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/walker2d/medium_replay_v2.yaml b/configs/offline/bc_10/walker2d/medium_replay_v2.yaml index 9641b7af..3ccd0aee 100644 --- a/configs/offline/bc_10/walker2d/medium_replay_v2.yaml +++ b/configs/offline/bc_10/walker2d/medium_replay_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: walker2d-medium-replay-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/walker2d/medium_v2.yaml b/configs/offline/bc_10/walker2d/medium_v2.yaml index d9d5139e..d46c0e19 100644 --- a/configs/offline/bc_10/walker2d/medium_v2.yaml +++ b/configs/offline/bc_10/walker2d/medium_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: walker2d-medium-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/bc_10/walker2d/random_v2.yaml b/configs/offline/bc_10/walker2d/random_v2.yaml index 192cf3fc..b4f33cf4 100644 --- a/configs/offline/bc_10/walker2d/random_v2.yaml +++ b/configs/offline/bc_10/walker2d/random_v2.yaml @@ -2,7 +2,7 @@ batch_size: 256 buffer_size: 10000000 checkpoints_path: null device: cuda -discount: 0.99 +discount: 1.0 env: walker2d-random-v2 eval_freq: 5000 frac: 0.1 diff --git a/configs/offline/cql/antmaze/large_diverse_v2.yaml b/configs/offline/cql/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..eafd3c51 --- /dev/null +++ b/configs/offline/cql/antmaze/large_diverse_v2.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_alpha: 5.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-large-diverse-v2 +eval_freq: 50000 +group: cql-antmaze-large-diverse-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/antmaze/large_play_v2.yaml 
b/configs/offline/cql/antmaze/large_play_v2.yaml new file mode 100644 index 00000000..ef87f98b --- /dev/null +++ b/configs/offline/cql/antmaze/large_play_v2.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_alpha: 5.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-large-play-v2 +eval_freq: 50000 +group: cql-antmaze-large-play-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true \ No newline at end of file diff --git a/configs/offline/cql/antmaze/medium_diverse_v2.yaml b/configs/offline/cql/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..52fea6d3 --- /dev/null +++ b/configs/offline/cql/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_alpha: 5.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-medium-diverse-v2 +eval_freq: 50000 +group: cql-antmaze-medium-diverse-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/antmaze/medium_play_v2.yaml b/configs/offline/cql/antmaze/medium_play_v2.yaml new file mode 100644 index 00000000..5b64ca66 --- /dev/null +++ b/configs/offline/cql/antmaze/medium_play_v2.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_alpha: 5.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-medium-play-v2 +eval_freq: 50000 +group: cql-antmaze-medium-play-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/antmaze/umaze_diverse_v2.yaml b/configs/offline/cql/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..abaac81e --- /dev/null +++ b/configs/offline/cql/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true 
+cql_lagrange: true +cql_max_target_backup: true +cql_alpha: 5.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-umaze-diverse-v2 +eval_freq: 50000 +group: cql-antmaze-umaze-diverse-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/antmaze/umaze_v2.yaml b/configs/offline/cql/antmaze/umaze_v2.yaml new file mode 100644 index 00000000..183c65f6 --- /dev/null +++ b/configs/offline/cql/antmaze/umaze_v2.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -200 +cql_importance_sample: true +cql_lagrange: true +cql_max_target_backup: true +cql_alpha: 5.0 +cql_n_actions: 10 +cql_target_action_gap: 0.8 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: antmaze-umaze-v2 +eval_freq: 50000 +group: cql-antmaze-umaze-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 100 +name: CQL +normalize: false +normalize_reward: true +orthogonal_init: true +policy_lr: 0.0001 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 5 +reward_scale: 10.0 +reward_bias: -5.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/door/cloned_v1.yaml b/configs/offline/cql/door/cloned_v1.yaml new file mode 100644 index 00000000..11a3e917 --- /dev/null +++ b/configs/offline/cql/door/cloned_v1.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -.inf +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: false +cql_alpha: 1.0 +cql_n_actions: 10 +cql_target_action_gap: -1.0 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: door-cloned-v1 +eval_freq: 5000 +group: cql-door-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: CQL +normalize: true +normalize_reward: false +orthogonal_init: true +policy_lr: 1.0e-04 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/door/expert_v1.yaml b/configs/offline/cql/door/expert_v1.yaml new file mode 100644 index 00000000..45d457ee --- /dev/null +++ b/configs/offline/cql/door/expert_v1.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -.inf +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: false +cql_alpha: 1.0 +cql_n_actions: 10 +cql_target_action_gap: -1.0 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: door-expert-v1 +eval_freq: 5000 +group: cql-door-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: CQL +normalize: true +normalize_reward: false +orthogonal_init: true +policy_lr: 1.0e-04 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 
+reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/antmaze/umaze_v0.yaml b/configs/offline/cql/door/human_v1.yaml similarity index 75% rename from configs/offline/cql/antmaze/umaze_v0.yaml rename to configs/offline/cql/door/human_v1.yaml index 32f71ad3..d2ed181c 100644 --- a/configs/offline/cql/antmaze/umaze_v0.yaml +++ b/configs/offline/cql/door/human_v1.yaml @@ -9,26 +9,29 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 1.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 device: cuda discount: 0.99 -env: antmaze-umaze-v0 +env: door-human-v1 eval_freq: 5000 -group: cql-antmaze-umaze-v0-multiseed-v0 +group: cql-door-human-v1-multiseed-v0 load_model: '' max_timesteps: 1000000 -n_episodes: 100 +n_episodes: 10 name: CQL normalize: true -normalize_reward: true +normalize_reward: false orthogonal_init: true -policy_lr: 3.0e-05 +policy_lr: 1.0e-04 project: CORL qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/halfcheetah/expert_v2.yaml b/configs/offline/cql/halfcheetah/expert_v2.yaml index fca41056..ecd907f6 100644 --- a/configs/offline/cql/halfcheetah/expert_v2.yaml +++ b/configs/offline/cql/halfcheetah/expert_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/halfcheetah/full_replay_v2.yaml b/configs/offline/cql/halfcheetah/full_replay_v2.yaml index 82ab6385..f7141bda 100644 --- a/configs/offline/cql/halfcheetah/full_replay_v2.yaml +++ b/configs/offline/cql/halfcheetah/full_replay_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/halfcheetah/medium_expert_v2.yaml b/configs/offline/cql/halfcheetah/medium_expert_v2.yaml index d6ed824f..2c01d261 100644 --- a/configs/offline/cql/halfcheetah/medium_expert_v2.yaml +++ b/configs/offline/cql/halfcheetah/medium_expert_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/halfcheetah/medium_replay_v2.yaml b/configs/offline/cql/halfcheetah/medium_replay_v2.yaml index b253a29f..e101525e 100644 --- a/configs/offline/cql/halfcheetah/medium_replay_v2.yaml +++ b/configs/offline/cql/halfcheetah/medium_replay_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: 
true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/halfcheetah/medium_v2.yaml b/configs/offline/cql/halfcheetah/medium_v2.yaml index 4d91b6c9..beb9ba09 100644 --- a/configs/offline/cql/halfcheetah/medium_v2.yaml +++ b/configs/offline/cql/halfcheetah/medium_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/halfcheetah/random_v2.yaml b/configs/offline/cql/halfcheetah/random_v2.yaml index f8f63d18..e81e689f 100644 --- a/configs/offline/cql/halfcheetah/random_v2.yaml +++ b/configs/offline/cql/halfcheetah/random_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/hammer/cloned_v1.yaml b/configs/offline/cql/hammer/cloned_v1.yaml new file mode 100644 index 00000000..87bc079f --- /dev/null +++ b/configs/offline/cql/hammer/cloned_v1.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -.inf +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: false +cql_alpha: 1.0 +cql_n_actions: 10 +cql_target_action_gap: -1.0 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: hammer-cloned-v1 +eval_freq: 5000 +group: cql-hammer-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: CQL +normalize: true +normalize_reward: false +orthogonal_init: true +policy_lr: 1.0e-04 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/hammer/expert_v1.yaml b/configs/offline/cql/hammer/expert_v1.yaml new file mode 100644 index 00000000..798df8be --- /dev/null +++ b/configs/offline/cql/hammer/expert_v1.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -.inf +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: false +cql_alpha: 1.0 +cql_n_actions: 10 +cql_target_action_gap: -1.0 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: hammer-expert-v1 +eval_freq: 5000 +group: cql-hammer-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: CQL +normalize: true +normalize_reward: false +orthogonal_init: true +policy_lr: 1.0e-04 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 
0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/hammer/human_v1.yaml b/configs/offline/cql/hammer/human_v1.yaml new file mode 100644 index 00000000..0ba0182e --- /dev/null +++ b/configs/offline/cql/hammer/human_v1.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -.inf +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: false +cql_alpha: 1.0 +cql_n_actions: 10 +cql_target_action_gap: -1.0 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: hammer-human-v1 +eval_freq: 5000 +group: cql-hammer-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: CQL +normalize: true +normalize_reward: false +orthogonal_init: true +policy_lr: 1.0e-04 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/hopper/expert_v2.yaml b/configs/offline/cql/hopper/expert_v2.yaml index 702ef4fa..b65686ae 100644 --- a/configs/offline/cql/hopper/expert_v2.yaml +++ b/configs/offline/cql/hopper/expert_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/hopper/full_replay_v2.yaml b/configs/offline/cql/hopper/full_replay_v2.yaml index 128afea7..01609c34 100644 --- a/configs/offline/cql/hopper/full_replay_v2.yaml +++ b/configs/offline/cql/hopper/full_replay_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/hopper/medium_expert_v2.yaml b/configs/offline/cql/hopper/medium_expert_v2.yaml index 335a53da..cd2d4480 100644 --- a/configs/offline/cql/hopper/medium_expert_v2.yaml +++ b/configs/offline/cql/hopper/medium_expert_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/hopper/medium_replay_v2.yaml b/configs/offline/cql/hopper/medium_replay_v2.yaml index bad7aca8..c7b90be1 100644 --- a/configs/offline/cql/hopper/medium_replay_v2.yaml +++ b/configs/offline/cql/hopper/medium_replay_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 
cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/hopper/medium_v2.yaml b/configs/offline/cql/hopper/medium_v2.yaml index df30ca2f..7adc020f 100644 --- a/configs/offline/cql/hopper/medium_v2.yaml +++ b/configs/offline/cql/hopper/medium_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/hopper/random_v2.yaml b/configs/offline/cql/hopper/random_v2.yaml index fd03cbc1..009a353e 100644 --- a/configs/offline/cql/hopper/random_v2.yaml +++ b/configs/offline/cql/hopper/random_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/maze2d/large_dense_v1.yaml b/configs/offline/cql/maze2d/large_dense_v1.yaml index 905bbebb..b23d0785 100644 --- a/configs/offline/cql/maze2d/large_dense_v1.yaml +++ b/configs/offline/cql/maze2d/large_dense_v1.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: true cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: 5.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/maze2d/large_v1.yaml b/configs/offline/cql/maze2d/large_v1.yaml index 279d6283..f190fb0b 100644 --- a/configs/offline/cql/maze2d/large_v1.yaml +++ b/configs/offline/cql/maze2d/large_v1.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: true cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: 5.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/maze2d/medium_dense_v1.yaml b/configs/offline/cql/maze2d/medium_dense_v1.yaml index c3fd1fad..7977f1bb 100644 --- a/configs/offline/cql/maze2d/medium_dense_v1.yaml +++ b/configs/offline/cql/maze2d/medium_dense_v1.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: true cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: 5.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/maze2d/medium_v1.yaml b/configs/offline/cql/maze2d/medium_v1.yaml index 6a8dbab1..69f023d6 100644 --- 
a/configs/offline/cql/maze2d/medium_v1.yaml +++ b/configs/offline/cql/maze2d/medium_v1.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: true cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: 5.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/maze2d/umaze_dense_v1.yaml b/configs/offline/cql/maze2d/umaze_dense_v1.yaml index 519f3e29..83b28d4f 100644 --- a/configs/offline/cql/maze2d/umaze_dense_v1.yaml +++ b/configs/offline/cql/maze2d/umaze_dense_v1.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: true cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: 5.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/maze2d/umaze_v1.yaml b/configs/offline/cql/maze2d/umaze_v1.yaml index af2c9707..8587c257 100644 --- a/configs/offline/cql/maze2d/umaze_v1.yaml +++ b/configs/offline/cql/maze2d/umaze_v1.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: true cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: 5.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/antmaze/large_play_v0.yaml b/configs/offline/cql/pen/cloned_v1.yaml similarity index 75% rename from configs/offline/cql/antmaze/large_play_v0.yaml rename to configs/offline/cql/pen/cloned_v1.yaml index 0e4d40af..6897218a 100644 --- a/configs/offline/cql/antmaze/large_play_v0.yaml +++ b/configs/offline/cql/pen/cloned_v1.yaml @@ -9,26 +9,29 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 1.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 device: cuda discount: 0.99 -env: antmaze-large-play-v0 +env: pen-cloned-v1 eval_freq: 5000 -group: cql-antmaze-large-play-v0-multiseed-v0 +group: cql-pen-cloned-v1-multiseed-v0 load_model: '' max_timesteps: 1000000 -n_episodes: 100 +n_episodes: 10 name: CQL normalize: true -normalize_reward: true +normalize_reward: false orthogonal_init: true -policy_lr: 3.0e-05 +policy_lr: 1.0e-04 project: CORL qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/pen/expert_v1.yaml b/configs/offline/cql/pen/expert_v1.yaml new file mode 100644 index 00000000..f0ad2488 --- /dev/null +++ b/configs/offline/cql/pen/expert_v1.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -.inf +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: false +cql_alpha: 1.0 +cql_n_actions: 10 +cql_target_action_gap: -1.0 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: pen-expert-v1 +eval_freq: 5000 +group: 
cql-pen-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: CQL +normalize: true +normalize_reward: false +orthogonal_init: true +policy_lr: 1.0e-04 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/antmaze/medium_play_v0.yaml b/configs/offline/cql/pen/human_v1.yaml similarity index 75% rename from configs/offline/cql/antmaze/medium_play_v0.yaml rename to configs/offline/cql/pen/human_v1.yaml index 616e9c48..93245d68 100644 --- a/configs/offline/cql/antmaze/medium_play_v0.yaml +++ b/configs/offline/cql/pen/human_v1.yaml @@ -9,26 +9,29 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 1.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 device: cuda discount: 0.99 -env: antmaze-medium-play-v0 +env: pen-human-v1 eval_freq: 5000 -group: cql-antmaze-medium-play-v0-multiseed-v0 +group: cql-pen-human-v1-multiseed-v0 load_model: '' max_timesteps: 1000000 -n_episodes: 100 +n_episodes: 10 name: CQL normalize: true -normalize_reward: true +normalize_reward: false orthogonal_init: true -policy_lr: 3.0e-05 +policy_lr: 1.0e-04 project: CORL qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/relocate/cloned_v1.yaml b/configs/offline/cql/relocate/cloned_v1.yaml new file mode 100644 index 00000000..0b7fbe23 --- /dev/null +++ b/configs/offline/cql/relocate/cloned_v1.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -.inf +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: false +cql_alpha: 1.0 +cql_n_actions: 10 +cql_target_action_gap: -1.0 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: relocate-cloned-v1 +eval_freq: 5000 +group: cql-relocate-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: CQL +normalize: true +normalize_reward: false +orthogonal_init: true +policy_lr: 1.0e-04 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/relocate/expert_v1.yaml b/configs/offline/cql/relocate/expert_v1.yaml new file mode 100644 index 00000000..98a95c53 --- /dev/null +++ b/configs/offline/cql/relocate/expert_v1.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -.inf +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: false +cql_alpha: 1.0 +cql_n_actions: 10 +cql_target_action_gap: -1.0 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: relocate-expert-v1 +eval_freq: 5000 +group: cql-relocate-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: CQL +normalize: true +normalize_reward: false +orthogonal_init: true +policy_lr: 1.0e-04 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 
+use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/relocate/human_v1.yaml b/configs/offline/cql/relocate/human_v1.yaml new file mode 100644 index 00000000..152555d2 --- /dev/null +++ b/configs/offline/cql/relocate/human_v1.yaml @@ -0,0 +1,37 @@ +alpha_multiplier: 1.0 +backup_entropy: false +batch_size: 256 +bc_steps: 0 +buffer_size: 10000000 +checkpoints_path: null +cql_clip_diff_max: .inf +cql_clip_diff_min: -.inf +cql_importance_sample: true +cql_lagrange: false +cql_max_target_backup: false +cql_alpha: 1.0 +cql_n_actions: 10 +cql_target_action_gap: -1.0 +cql_temp: 1.0 +device: cuda +discount: 0.99 +env: relocate-human-v1 +eval_freq: 5000 +group: cql-relocate-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: CQL +normalize: true +normalize_reward: false +orthogonal_init: true +policy_lr: 1.0e-04 +project: CORL +qf_lr: 0.0003 +seed: 0 +soft_target_update_rate: 0.005 +target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 +use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/walker2d/expert_v2.yaml b/configs/offline/cql/walker2d/expert_v2.yaml index 70689613..08ec2f75 100644 --- a/configs/offline/cql/walker2d/expert_v2.yaml +++ b/configs/offline/cql/walker2d/expert_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/walker2d/full_replay_v2.yaml b/configs/offline/cql/walker2d/full_replay_v2.yaml index a5a0e49f..9cd78ec9 100644 --- a/configs/offline/cql/walker2d/full_replay_v2.yaml +++ b/configs/offline/cql/walker2d/full_replay_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/walker2d/medium_expert_v2.yaml b/configs/offline/cql/walker2d/medium_expert_v2.yaml index 6e4a41c0..4e588da2 100644 --- a/configs/offline/cql/walker2d/medium_expert_v2.yaml +++ b/configs/offline/cql/walker2d/medium_expert_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/walker2d/medium_replay_v2.yaml b/configs/offline/cql/walker2d/medium_replay_v2.yaml index 058f2a99..9f552f17 100644 --- a/configs/offline/cql/walker2d/medium_replay_v2.yaml +++ b/configs/offline/cql/walker2d/medium_replay_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 
soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/walker2d/medium_v2.yaml b/configs/offline/cql/walker2d/medium_v2.yaml index 21acf581..f1e04fbd 100644 --- a/configs/offline/cql/walker2d/medium_v2.yaml +++ b/configs/offline/cql/walker2d/medium_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/cql/walker2d/random_v2.yaml b/configs/offline/cql/walker2d/random_v2.yaml index 6f36bdc7..61b250a8 100644 --- a/configs/offline/cql/walker2d/random_v2.yaml +++ b/configs/offline/cql/walker2d/random_v2.yaml @@ -9,7 +9,7 @@ cql_clip_diff_min: -.inf cql_importance_sample: true cql_lagrange: false cql_max_target_backup: false -cql_min_q_weight: 10.0 +cql_alpha: 10.0 cql_n_actions: 10 cql_target_action_gap: -1.0 cql_temp: 1.0 @@ -31,4 +31,7 @@ qf_lr: 0.0003 seed: 0 soft_target_update_rate: 0.005 target_update_period: 1 +q_n_hidden_layers: 3 +reward_scale: 1.0 +reward_bias: 0.0 use_automatic_entropy_tuning: true diff --git a/configs/offline/dt/antmaze/large_diverse_v2.yaml b/configs/offline/dt/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..2cdf05ff --- /dev/null +++ b/configs/offline/dt/antmaze/large_diverse_v2.yaml @@ -0,0 +1,33 @@ +attention_dropout: 0.1 +batch_size: 64 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "antmaze-large-diverse-v2" +episode_len: 1000 +eval_episodes: 100 +eval_every: 10000 +eval_seed: 42 +group: "dt-antmaze-large-diverse-v2-multiseed-v0" +learning_rate: 0.0001 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [1.0, 0.5, 0.25, 0.125, 0.06, 0.03] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 + diff --git a/configs/offline/dt/antmaze/large_play_v0.yaml b/configs/offline/dt/antmaze/large_play_v2.yaml similarity index 86% rename from configs/offline/dt/antmaze/large_play_v0.yaml rename to configs/offline/dt/antmaze/large_play_v2.yaml index 4ee49ba6..ea857971 100644 --- a/configs/offline/dt/antmaze/large_play_v0.yaml +++ b/configs/offline/dt/antmaze/large_play_v2.yaml @@ -9,12 +9,12 @@ deterministic_torch: false device: cuda embedding_dim: 128 embedding_dropout: 0.1 -env_name: "antmaze-large-play-v0" +env_name: "antmaze-large-play-v2" episode_len: 1000 eval_episodes: 100 eval_every: 10000 eval_seed: 42 -group: "dt-antmaze-large-play-v0-multiseed-v0" +group: "dt-antmaze-large-play-v2-multiseed-v0" learning_rate: 0.0001 max_action: 1.0 name: "DT" diff --git a/configs/offline/dt/antmaze/medium_diverse_v2.yaml b/configs/offline/dt/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..ee329f84 --- /dev/null +++ b/configs/offline/dt/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 64 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: 
"antmaze-medium-diverse-v2" +episode_len: 1000 +eval_episodes: 100 +eval_every: 10000 +eval_seed: 42 +group: "dt-antmaze-medium-diverse-v2-multiseed-v0" +learning_rate: 0.0001 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [1.0, 0.5, 0.25, 0.125, 0.06, 0.03] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/antmaze/medium_play_v0.yaml b/configs/offline/dt/antmaze/medium_play_v2.yaml similarity index 86% rename from configs/offline/dt/antmaze/medium_play_v0.yaml rename to configs/offline/dt/antmaze/medium_play_v2.yaml index 5540f382..d42d92a0 100644 --- a/configs/offline/dt/antmaze/medium_play_v0.yaml +++ b/configs/offline/dt/antmaze/medium_play_v2.yaml @@ -9,12 +9,12 @@ deterministic_torch: false device: cuda embedding_dim: 128 embedding_dropout: 0.1 -env_name: "antmaze-medium-play-v0" +env_name: "antmaze-medium-play-v2" episode_len: 1000 eval_episodes: 100 eval_every: 10000 eval_seed: 42 -group: "dt-antmaze-medium-play-v0-multiseed-v0" +group: "dt-antmaze-medium-play-v2-multiseed-v0" learning_rate: 0.0001 max_action: 1.0 name: "DT" diff --git a/configs/offline/dt/antmaze/umaze_diverse_v2.yaml b/configs/offline/dt/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..002e25a7 --- /dev/null +++ b/configs/offline/dt/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 64 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "antmaze-umaze-diverse-v2" +episode_len: 1000 +eval_episodes: 100 +eval_every: 10000 +eval_seed: 42 +group: "dt-antmaze-umaze-diverse-v2-multiseed-v0" +learning_rate: 0.0001 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [1.0, 0.5, 0.25, 0.125, 0.06, 0.03] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/antmaze/umaze_v0.yaml b/configs/offline/dt/antmaze/umaze_v2.yaml similarity index 87% rename from configs/offline/dt/antmaze/umaze_v0.yaml rename to configs/offline/dt/antmaze/umaze_v2.yaml index 9aaeea70..423557a7 100644 --- a/configs/offline/dt/antmaze/umaze_v0.yaml +++ b/configs/offline/dt/antmaze/umaze_v2.yaml @@ -9,12 +9,12 @@ deterministic_torch: false device: cuda embedding_dim: 128 embedding_dropout: 0.1 -env_name: "antmaze-umaze-v0" +env_name: "antmaze-umaze-v2" episode_len: 1000 eval_episodes: 100 eval_every: 10000 eval_seed: 42 -group: "dt-antmaze-umaze-v0-multiseed-v0" +group: "dt-antmaze-umaze-v2-multiseed-v0" learning_rate: 0.0001 max_action: 1.0 name: "DT" diff --git a/configs/offline/dt/door/cloned_v1.yaml b/configs/offline/dt/door/cloned_v1.yaml new file mode 100644 index 00000000..e0d6e284 --- /dev/null +++ b/configs/offline/dt/door/cloned_v1.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "door-cloned-v1" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-door-cloned-multiseed-v0" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: 
"CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [2900, 1450] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/door/expert_v1.yaml b/configs/offline/dt/door/expert_v1.yaml new file mode 100644 index 00000000..6f1f70b7 --- /dev/null +++ b/configs/offline/dt/door/expert_v1.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "door-expert-v1" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-door-expert-multiseed-v0" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [2900, 1450] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/door/human_v1.yaml b/configs/offline/dt/door/human_v1.yaml new file mode 100644 index 00000000..a2a2d2b3 --- /dev/null +++ b/configs/offline/dt/door/human_v1.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "door-human-v1" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-door-human-multiseed-v0" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [2900, 1450] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/halfcheetah/medium_expert_v2.yaml b/configs/offline/dt/halfcheetah/medium_expert_v2.yaml index 900ac056..16fffc1f 100644 --- a/configs/offline/dt/halfcheetah/medium_expert_v2.yaml +++ b/configs/offline/dt/halfcheetah/medium_expert_v2.yaml @@ -23,7 +23,7 @@ num_layers: 3 num_workers: 4 project: "CORL" residual_dropout: 0.1 -reward_scale: 1.0 +reward_scale: 0.001 seq_len: 20 target_returns: [12000.0, 6000.0] train_seed: 10 diff --git a/configs/offline/dt/halfcheetah/medium_replay_v2.yaml b/configs/offline/dt/halfcheetah/medium_replay_v2.yaml index e570322d..6f12d0f1 100644 --- a/configs/offline/dt/halfcheetah/medium_replay_v2.yaml +++ b/configs/offline/dt/halfcheetah/medium_replay_v2.yaml @@ -23,7 +23,7 @@ num_layers: 3 num_workers: 4 project: "CORL" residual_dropout: 0.1 -reward_scale: 1.0 +reward_scale: 0.001 seq_len: 20 target_returns: [12000.0, 6000.0] train_seed: 10 diff --git a/configs/offline/dt/halfcheetah/medium_v2.yaml b/configs/offline/dt/halfcheetah/medium_v2.yaml index c5d46e33..8b6e4417 100644 --- a/configs/offline/dt/halfcheetah/medium_v2.yaml +++ b/configs/offline/dt/halfcheetah/medium_v2.yaml @@ -23,7 +23,7 @@ num_layers: 3 num_workers: 4 project: "CORL" residual_dropout: 0.1 -reward_scale: 1.0 +reward_scale: 0.001 seq_len: 20 target_returns: [12000.0, 6000.0] train_seed: 10 diff --git a/configs/offline/dt/hammer/cloned_v1.yaml b/configs/offline/dt/hammer/cloned_v1.yaml new file mode 100644 index 00000000..750b0390 --- /dev/null +++ b/configs/offline/dt/hammer/cloned_v1.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: 
null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "hammer-cloned-v1" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-hammer-cloned-multiseed-v0" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [12800, 6400] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/hammer/expert_v1.yaml b/configs/offline/dt/hammer/expert_v1.yaml new file mode 100644 index 00000000..e209c608 --- /dev/null +++ b/configs/offline/dt/hammer/expert_v1.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "hammer-expert-v1" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-hammer-expert-multiseed-v0" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [12800, 6400] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/hammer/human_v1.yaml b/configs/offline/dt/hammer/human_v1.yaml new file mode 100644 index 00000000..bcded6d4 --- /dev/null +++ b/configs/offline/dt/hammer/human_v1.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "hammer-human-v1" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-hammer-human-multiseed-v0" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [12800, 6400] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/hopper/medium_expert_v2.yaml b/configs/offline/dt/hopper/medium_expert_v2.yaml index 18d9d07e..3a7b7d58 100644 --- a/configs/offline/dt/hopper/medium_expert_v2.yaml +++ b/configs/offline/dt/hopper/medium_expert_v2.yaml @@ -23,7 +23,7 @@ num_layers: 3 num_workers: 4 project: "CORL" residual_dropout: 0.1 -reward_scale: 1.0 +reward_scale: 0.001 seq_len: 20 target_returns: [3600.0, 1800.0] train_seed: 10 diff --git a/configs/offline/dt/hopper/medium_replay_v2.yaml b/configs/offline/dt/hopper/medium_replay_v2.yaml index b9311d1c..8f61412b 100644 --- a/configs/offline/dt/hopper/medium_replay_v2.yaml +++ b/configs/offline/dt/hopper/medium_replay_v2.yaml @@ -23,7 +23,7 @@ num_layers: 3 num_workers: 4 project: "CORL" residual_dropout: 0.1 -reward_scale: 1.0 +reward_scale: 0.001 seq_len: 20 target_returns: [3600.0, 1800.0] train_seed: 10 diff --git a/configs/offline/dt/hopper/medium_v2.yaml b/configs/offline/dt/hopper/medium_v2.yaml index 7e8fa9f3..58c532dc 100644 --- a/configs/offline/dt/hopper/medium_v2.yaml +++ b/configs/offline/dt/hopper/medium_v2.yaml @@ -23,7 +23,7 @@ num_layers: 3 num_workers: 4 project: "CORL" residual_dropout: 0.1 -reward_scale: 1.0 +reward_scale: 0.001 seq_len: 20 target_returns: [3600.0, 1800.0] 
train_seed: 10 diff --git a/configs/offline/dt/pen/cloned_v1.yaml b/configs/offline/dt/pen/cloned_v1.yaml new file mode 100644 index 00000000..fd319d52 --- /dev/null +++ b/configs/offline/dt/pen/cloned_v1.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "pen-cloned-v1" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-pen-cloned-multiseed-v0" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [3100, 1550] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/pen/expert_v1.yaml b/configs/offline/dt/pen/expert_v1.yaml new file mode 100644 index 00000000..d0e76800 --- /dev/null +++ b/configs/offline/dt/pen/expert_v1.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "pen-expert-v1" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-pen-expert-multiseed-v0" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [3100, 1550] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/pen/human_v1.yaml b/configs/offline/dt/pen/human_v1.yaml new file mode 100644 index 00000000..f817151e --- /dev/null +++ b/configs/offline/dt/pen/human_v1.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "pen-human-v1" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-pen-human-multiseed-v0" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [3100, 1550] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/relocate/cloned_v1.yaml b/configs/offline/dt/relocate/cloned_v1.yaml new file mode 100644 index 00000000..2b64284b --- /dev/null +++ b/configs/offline/dt/relocate/cloned_v1.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "relocate-cloned-v1" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-relocate-cloned-multiseed-v0" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [4300, 2150] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/relocate/expert_v1.yaml b/configs/offline/dt/relocate/expert_v1.yaml new file mode 100644 
index 00000000..bd2fb153 --- /dev/null +++ b/configs/offline/dt/relocate/expert_v1.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "relocate-expert-v1" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-relocate-expert-multiseed-v0" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [4300, 2150] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/relocate/human_v1.yaml b/configs/offline/dt/relocate/human_v1.yaml new file mode 100644 index 00000000..a8c590b5 --- /dev/null +++ b/configs/offline/dt/relocate/human_v1.yaml @@ -0,0 +1,32 @@ +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "relocate-human-v1" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-relocate-human-multiseed-v0" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 1.0 +seq_len: 20 +target_returns: [4300, 2150] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 \ No newline at end of file diff --git a/configs/offline/dt/walker2d/medium_expert_v2.yaml b/configs/offline/dt/walker2d/medium_expert_v2.yaml index 32e1e38b..2ac29a9d 100644 --- a/configs/offline/dt/walker2d/medium_expert_v2.yaml +++ b/configs/offline/dt/walker2d/medium_expert_v2.yaml @@ -23,7 +23,7 @@ num_layers: 3 num_workers: 4 project: "CORL" residual_dropout: 0.1 -reward_scale: 1.0 +reward_scale: 0.001 seq_len: 20 target_returns: [5000.0, 2500.0] train_seed: 10 diff --git a/configs/offline/dt/walker2d/medium_replay_v2.yaml b/configs/offline/dt/walker2d/medium_replay_v2.yaml index a703f403..daa394da 100644 --- a/configs/offline/dt/walker2d/medium_replay_v2.yaml +++ b/configs/offline/dt/walker2d/medium_replay_v2.yaml @@ -23,7 +23,7 @@ num_layers: 3 num_workers: 4 project: "CORL" residual_dropout: 0.1 -reward_scale: 1.0 +reward_scale: 0.001 seq_len: 20 target_returns: [5000.0, 2500.0] train_seed: 10 diff --git a/configs/offline/dt/walker2d/medium_v2.yaml b/configs/offline/dt/walker2d/medium_v2.yaml index a06b064e..630b8685 100644 --- a/configs/offline/dt/walker2d/medium_v2.yaml +++ b/configs/offline/dt/walker2d/medium_v2.yaml @@ -23,7 +23,7 @@ num_layers: 3 num_workers: 4 project: "CORL" residual_dropout: 0.1 -reward_scale: 1.0 +reward_scale: 0.001 seq_len: 20 target_returns: [5000.0, 2500.0] train_seed: 10 diff --git a/configs/offline/edac/antmaze/large_diverse_v2.yaml b/configs/offline/edac/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..191ca89a --- /dev/null +++ b/configs/offline/edac/antmaze/large_diverse_v2.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "antmaze-large-diverse-v2" +eta: 1.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-antmaze-large-diverse-v2-multiseed-v0" +hidden_dim: 256 
+log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 10 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/antmaze/large_play_v0.yaml b/configs/offline/edac/antmaze/large_play_v2.yaml similarity index 84% rename from configs/offline/edac/antmaze/large_play_v0.yaml rename to configs/offline/edac/antmaze/large_play_v2.yaml index 1d01d2ae..cc2088eb 100644 --- a/configs/offline/edac/antmaze/large_play_v0.yaml +++ b/configs/offline/edac/antmaze/large_play_v2.yaml @@ -6,13 +6,13 @@ checkpoints_path: null critic_learning_rate: 0.0003 deterministic_torch: false device: cuda -env_name: "antmaze-large-play-v0" +env_name: "antmaze-large-play-v2" eta: 1.0 eval_episodes: 10 eval_every: 5 eval_seed: 42 gamma: 0.99 -group: "edac-antmaze-large-play-v0-multiseed-v0" +group: "edac-antmaze-large-play-v2-multiseed-v0" hidden_dim: 256 log_every: 100 max_action: 1.0 diff --git a/configs/offline/edac/antmaze/medium_diverse_v2.yaml b/configs/offline/edac/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..017894c5 --- /dev/null +++ b/configs/offline/edac/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "antmaze-medium-diverse-v2" +eta: 1.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-antmaze-medium-diverse-v2-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 10 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/antmaze/medium_play_v0.yaml b/configs/offline/edac/antmaze/medium_play_v2.yaml similarity index 83% rename from configs/offline/edac/antmaze/medium_play_v0.yaml rename to configs/offline/edac/antmaze/medium_play_v2.yaml index 8b97cf24..e2f07f6d 100644 --- a/configs/offline/edac/antmaze/medium_play_v0.yaml +++ b/configs/offline/edac/antmaze/medium_play_v2.yaml @@ -6,13 +6,13 @@ checkpoints_path: null critic_learning_rate: 0.0003 deterministic_torch: false device: cuda -env_name: "antmaze-medium-play-v0" +env_name: "antmaze-medium-play-v2" eta: 1.0 eval_episodes: 10 eval_every: 5 eval_seed: 42 gamma: 0.99 -group: "edac-antmaze-medium-play-v0-multiseed-v0" +group: "edac-antmaze-medium-play-v2-multiseed-v0" hidden_dim: 256 log_every: 100 max_action: 1.0 diff --git a/configs/offline/edac/antmaze/umaze_diverse_v2.yaml b/configs/offline/edac/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..250cf38b --- /dev/null +++ b/configs/offline/edac/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "antmaze-umaze-diverse-v2" +eta: 1.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-antmaze-umaze-diverse-v2-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 10 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/antmaze/umaze_v0.yaml 
b/configs/offline/edac/antmaze/umaze_v2.yaml similarity index 85% rename from configs/offline/edac/antmaze/umaze_v0.yaml rename to configs/offline/edac/antmaze/umaze_v2.yaml index 5cf4964d..b74a4a32 100644 --- a/configs/offline/edac/antmaze/umaze_v0.yaml +++ b/configs/offline/edac/antmaze/umaze_v2.yaml @@ -6,13 +6,13 @@ checkpoints_path: null critic_learning_rate: 0.0003 deterministic_torch: false device: cuda -env_name: "antmaze-umaze-v0" +env_name: "antmaze-umaze-v2" eta: 1.0 eval_episodes: 10 eval_every: 5 eval_seed: 42 gamma: 0.99 -group: "edac-antmaze-umaze-v0-multiseed-v0" +group: "edac-antmaze-umaze-v2-multiseed-v0" hidden_dim: 256 log_every: 100 max_action: 1.0 diff --git a/configs/offline/edac/door/cloned_v1.yaml b/configs/offline/edac/door/cloned_v1.yaml new file mode 100644 index 00000000..ec72d583 --- /dev/null +++ b/configs/offline/edac/door/cloned_v1.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "door-cloned-v1" +eta: 200.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-door-cloned-v1-multiseed-v2" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 50 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/door/expert_v1.yaml b/configs/offline/edac/door/expert_v1.yaml new file mode 100644 index 00000000..60ab283e --- /dev/null +++ b/configs/offline/edac/door/expert_v1.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "door-expert-v1" +eta: 200.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-door-expert-v1-multiseed-v2" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 50 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/door/human_v1.yaml b/configs/offline/edac/door/human_v1.yaml new file mode 100644 index 00000000..f32de93d --- /dev/null +++ b/configs/offline/edac/door/human_v1.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "door-human-v1" +eta: 200.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-door-human-v1-multiseed-v2" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 50 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/hammer/cloned_v1.yaml b/configs/offline/edac/hammer/cloned_v1.yaml new file mode 100644 index 00000000..f292b523 --- /dev/null +++ b/configs/offline/edac/hammer/cloned_v1.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "hammer-cloned-v1" +eta: 200.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 
+group: "edac-hammer-cloned-v1-multiseed-v2" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 50 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/hammer/expert_v1.yaml b/configs/offline/edac/hammer/expert_v1.yaml new file mode 100644 index 00000000..715616d7 --- /dev/null +++ b/configs/offline/edac/hammer/expert_v1.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "hammer-expert-v1" +eta: 200.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-hammer-expert-v1-multiseed-v2" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 50 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/hammer/human_v1.yaml b/configs/offline/edac/hammer/human_v1.yaml new file mode 100644 index 00000000..405352f1 --- /dev/null +++ b/configs/offline/edac/hammer/human_v1.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "hammer-human-v1" +eta: 200.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-hammer-human-v1-multiseed-v2" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 50 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/pen/cloned_v1.yaml b/configs/offline/edac/pen/cloned_v1.yaml new file mode 100644 index 00000000..f7fa7d33 --- /dev/null +++ b/configs/offline/edac/pen/cloned_v1.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "pen-cloned-v1" +eta: 10.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-pen-cloned-v1-multiseed-v2" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 20 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/pen/expert_v1.yaml b/configs/offline/edac/pen/expert_v1.yaml new file mode 100644 index 00000000..74375116 --- /dev/null +++ b/configs/offline/edac/pen/expert_v1.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "pen-expert-v1" +eta: 10.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-pen-expert-v1-multiseed-v2" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 20 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/pen/human_v1.yaml b/configs/offline/edac/pen/human_v1.yaml new file mode 100644 index 00000000..61a83cf7 --- 
/dev/null +++ b/configs/offline/edac/pen/human_v1.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "pen-human-v1" +eta: 1000.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-pen-human-v1-multiseed-v2" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 20 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/relocate/cloned_v1.yaml b/configs/offline/edac/relocate/cloned_v1.yaml new file mode 100644 index 00000000..60b2c67f --- /dev/null +++ b/configs/offline/edac/relocate/cloned_v1.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "relocate-cloned-v1" +eta: 200.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-relocate-cloned-v1-multiseed-v2" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 50 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/relocate/expert_v1.yaml b/configs/offline/edac/relocate/expert_v1.yaml new file mode 100644 index 00000000..6be80844 --- /dev/null +++ b/configs/offline/edac/relocate/expert_v1.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "relocate-expert-v1" +eta: 200.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-relocate-expert-v1-multiseed-v2" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 50 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/edac/relocate/human_v1.yaml b/configs/offline/edac/relocate/human_v1.yaml new file mode 100644 index 00000000..36979f7c --- /dev/null +++ b/configs/offline/edac/relocate/human_v1.yaml @@ -0,0 +1,26 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "relocate-human-v1" +eta: 200.0 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "edac-relocate-human-v1-multiseed-v2" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "EDAC" +normalize_reward: false +num_critics: 50 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/iql/antmaze/large_diverse_v2.yaml b/configs/offline/iql/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..8284014c --- /dev/null +++ b/configs/offline/iql/antmaze/large_diverse_v2.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +batch_size: 256 +beta: 10.0 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-large-diverse-v2 +eval_freq: 5000 +group: iql-antmaze-large-diverse-v2-multiseed-v0 +iql_deterministic: false +iql_tau: 0.9 
+load_model: '' +max_timesteps: 1000000 +n_episodes: 100 +name: IQL +normalize: true +normalize_reward: true +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/antmaze/medium_play_v0.yaml b/configs/offline/iql/antmaze/large_play_v2.yaml similarity index 72% rename from configs/offline/iql/antmaze/medium_play_v0.yaml rename to configs/offline/iql/antmaze/large_play_v2.yaml index d637421f..9172ea6b 100644 --- a/configs/offline/iql/antmaze/medium_play_v0.yaml +++ b/configs/offline/iql/antmaze/large_play_v2.yaml @@ -1,12 +1,13 @@ +actor_lr: 3e-4 batch_size: 256 beta: 10.0 buffer_size: 10000000 checkpoints_path: null device: cuda discount: 0.99 -env: antmaze-medium-play-v0 +env: antmaze-large-play-v2 eval_freq: 5000 -group: iql-antmaze-medium-play-v0-multiseed-v0 +group: iql-antmaze-large-play-v2-multiseed-v0 iql_deterministic: false iql_tau: 0.9 load_model: '' @@ -15,6 +16,8 @@ n_episodes: 100 name: IQL normalize: true normalize_reward: true +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/antmaze/medium_diverse_v2.yaml b/configs/offline/iql/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..90eb96d4 --- /dev/null +++ b/configs/offline/iql/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +batch_size: 256 +beta: 10.0 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-medium-diverse-v2 +eval_freq: 5000 +group: iql-antmaze-medium-diverse-v2-multiseed-v0 +iql_deterministic: false +iql_tau: 0.9 +load_model: '' +max_timesteps: 1000000 +n_episodes: 100 +name: IQL +normalize: true +normalize_reward: true +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/antmaze/umaze_v0.yaml b/configs/offline/iql/antmaze/medium_play_v2.yaml similarity index 71% rename from configs/offline/iql/antmaze/umaze_v0.yaml rename to configs/offline/iql/antmaze/medium_play_v2.yaml index 8d39d4a2..ee72ff97 100644 --- a/configs/offline/iql/antmaze/umaze_v0.yaml +++ b/configs/offline/iql/antmaze/medium_play_v2.yaml @@ -1,12 +1,13 @@ +actor_lr: 3e-4 batch_size: 256 beta: 10.0 buffer_size: 10000000 checkpoints_path: null device: cuda discount: 0.99 -env: antmaze-umaze-v0 +env: antmaze-medium-play-v2 eval_freq: 5000 -group: iql-antmaze-umaze-v0-multiseed-v0 +group: iql-antmaze-medium-play-v2-multiseed-v0 iql_deterministic: false iql_tau: 0.9 load_model: '' @@ -15,6 +16,8 @@ n_episodes: 100 name: IQL normalize: true normalize_reward: true +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/antmaze/umaze_diverse_v2.yaml b/configs/offline/iql/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..22213e18 --- /dev/null +++ b/configs/offline/iql/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +batch_size: 256 +beta: 10.0 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-umaze-diverse-v2 +eval_freq: 5000 +group: iql-antmaze-umaze-diverse-v2-multiseed-v0 +iql_deterministic: false +iql_tau: 0.9 +load_model: '' +max_timesteps: 1000000 +n_episodes: 100 +name: IQL +normalize: true +normalize_reward: true +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/antmaze/large_play_v0.yaml b/configs/offline/iql/antmaze/umaze_v2.yaml similarity index 73% rename from configs/offline/iql/antmaze/large_play_v0.yaml rename to configs/offline/iql/antmaze/umaze_v2.yaml index 2a5edf2d..7cc0169e 100644 --- 
a/configs/offline/iql/antmaze/large_play_v0.yaml +++ b/configs/offline/iql/antmaze/umaze_v2.yaml @@ -1,12 +1,13 @@ +actor_lr: 3e-4 batch_size: 256 beta: 10.0 buffer_size: 10000000 checkpoints_path: null device: cuda discount: 0.99 -env: antmaze-large-play-v0 +env: antmaze-umaze-v2 eval_freq: 5000 -group: iql-antmaze-large-play-v0-multiseed-v0 +group: iql-antmaze-umaze-v2-multiseed-v0 iql_deterministic: false iql_tau: 0.9 load_model: '' @@ -15,6 +16,8 @@ n_episodes: 100 name: IQL normalize: true normalize_reward: true +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/door/cloned_v1.yaml b/configs/offline/iql/door/cloned_v1.yaml new file mode 100644 index 00000000..c96f83c9 --- /dev/null +++ b/configs/offline/iql/door/cloned_v1.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +device: cuda +discount: 0.99 +env: door-cloned-v1 +eval_freq: 5000 +group: iql-adroit-door-cloned-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: IQL +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/door/expert_v1.yaml b/configs/offline/iql/door/expert_v1.yaml new file mode 100644 index 00000000..53c0de59 --- /dev/null +++ b/configs/offline/iql/door/expert_v1.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +device: cuda +discount: 0.99 +env: door-expert-v1 +eval_freq: 5000 +group: iql-adroit-door-expert-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: IQL +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/door/human_v1.yaml b/configs/offline/iql/door/human_v1.yaml new file mode 100644 index 00000000..1fa6bfb7 --- /dev/null +++ b/configs/offline/iql/door/human_v1.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +device: cuda +discount: 0.99 +env: door-human-v1 +eval_freq: 5000 +group: iql-adroit-door-human-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: IQL +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/halfcheetah/expert_v2.yaml b/configs/offline/iql/halfcheetah/expert_v2.yaml index 0b448928..d7cb9666 100644 --- a/configs/offline/iql/halfcheetah/expert_v2.yaml +++ b/configs/offline/iql/halfcheetah/expert_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -15,6 +16,8 @@ n_episodes: 10 name: IQL normalize: true normalize_reward: false +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/halfcheetah/full_replay_v2.yaml b/configs/offline/iql/halfcheetah/full_replay_v2.yaml index f41ec2cf..34ab8dd7 100644 --- a/configs/offline/iql/halfcheetah/full_replay_v2.yaml +++ b/configs/offline/iql/halfcheetah/full_replay_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -15,6 +16,8 @@ n_episodes: 10 name: IQL normalize: true normalize_reward: false +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/halfcheetah/medium_expert_v2.yaml b/configs/offline/iql/halfcheetah/medium_expert_v2.yaml 
index 9669d7ed..b9cb16a4 100644 --- a/configs/offline/iql/halfcheetah/medium_expert_v2.yaml +++ b/configs/offline/iql/halfcheetah/medium_expert_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -16,6 +17,8 @@ name: IQL normalize: true normalize_reward: false project: CORL +qf_lr: 3e-4 seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/halfcheetah/medium_replay_v2.yaml b/configs/offline/iql/halfcheetah/medium_replay_v2.yaml index 927ed57d..68447a73 100644 --- a/configs/offline/iql/halfcheetah/medium_replay_v2.yaml +++ b/configs/offline/iql/halfcheetah/medium_replay_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -16,6 +17,8 @@ name: IQL normalize: true normalize_reward: false project: CORL +qf_lr: 3e-4 seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/halfcheetah/medium_v2.yaml b/configs/offline/iql/halfcheetah/medium_v2.yaml index 7021aa47..8c2f2080 100644 --- a/configs/offline/iql/halfcheetah/medium_v2.yaml +++ b/configs/offline/iql/halfcheetah/medium_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -16,6 +17,8 @@ name: IQL normalize: true normalize_reward: false project: CORL +qf_lr: 3e-4 seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/halfcheetah/random_v2.yaml b/configs/offline/iql/halfcheetah/random_v2.yaml index 961f1015..50ec69e2 100644 --- a/configs/offline/iql/halfcheetah/random_v2.yaml +++ b/configs/offline/iql/halfcheetah/random_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -15,6 +16,8 @@ n_episodes: 10 name: IQL normalize: true normalize_reward: false +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/hammer/cloned_v1.yaml b/configs/offline/iql/hammer/cloned_v1.yaml new file mode 100644 index 00000000..f5078bc8 --- /dev/null +++ b/configs/offline/iql/hammer/cloned_v1.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +device: cuda +discount: 0.99 +env: hammer-cloned-v1 +eval_freq: 5000 +group: iql-adroit-hammer-cloned-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: IQL +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/hammer/expert_v1.yaml b/configs/offline/iql/hammer/expert_v1.yaml new file mode 100644 index 00000000..49e28b6c --- /dev/null +++ b/configs/offline/iql/hammer/expert_v1.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +device: cuda +discount: 0.99 +env: hammer-expert-v1 +eval_freq: 5000 +group: iql-adroit-hammer-expert-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: IQL +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/hammer/human_v1.yaml b/configs/offline/iql/hammer/human_v1.yaml new file mode 100644 index 00000000..a5eb1178 --- /dev/null +++ b/configs/offline/iql/hammer/human_v1.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +device: cuda +discount: 0.99 +env: hammer-human-v1 +eval_freq: 5000 +group: iql-adroit-hammer-human-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +max_timesteps: 1000000 
+n_episodes: 10 +name: IQL +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/hopper/expert_v2.yaml b/configs/offline/iql/hopper/expert_v2.yaml index 93bc3b47..f63990f8 100644 --- a/configs/offline/iql/hopper/expert_v2.yaml +++ b/configs/offline/iql/hopper/expert_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -15,6 +16,8 @@ n_episodes: 10 name: IQL normalize: true normalize_reward: false +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/hopper/full_replay_v2.yaml b/configs/offline/iql/hopper/full_replay_v2.yaml index 619b3544..afc60d24 100644 --- a/configs/offline/iql/hopper/full_replay_v2.yaml +++ b/configs/offline/iql/hopper/full_replay_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -15,6 +16,8 @@ n_episodes: 10 name: IQL normalize: true normalize_reward: false +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/hopper/medium_expert_v2.yaml b/configs/offline/iql/hopper/medium_expert_v2.yaml index bc7a1352..92594fa3 100644 --- a/configs/offline/iql/hopper/medium_expert_v2.yaml +++ b/configs/offline/iql/hopper/medium_expert_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 6.0 buffer_size: 10000000 @@ -16,6 +17,8 @@ name: IQL normalize: true normalize_reward: false project: CORL +qf_lr: 3e-4 seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/hopper/medium_replay_v2.yaml b/configs/offline/iql/hopper/medium_replay_v2.yaml index a697c6c8..fe247984 100644 --- a/configs/offline/iql/hopper/medium_replay_v2.yaml +++ b/configs/offline/iql/hopper/medium_replay_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -16,6 +17,8 @@ name: IQL normalize: true normalize_reward: true project: CORL +qf_lr: 3e-4 seed: 0 tau: 0.001 +vf_lr: 3e-4 diff --git a/configs/offline/iql/hopper/medium_v2.yaml b/configs/offline/iql/hopper/medium_v2.yaml index 72cddf19..f1887939 100644 --- a/configs/offline/iql/hopper/medium_v2.yaml +++ b/configs/offline/iql/hopper/medium_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -16,6 +17,8 @@ name: IQL normalize: true normalize_reward: true project: CORL +qf_lr: 3e-4 seed: 0 tau: 0.001 +vf_lr: 3e-4 diff --git a/configs/offline/iql/hopper/random_v2.yaml b/configs/offline/iql/hopper/random_v2.yaml index de1cd069..4ee96de2 100644 --- a/configs/offline/iql/hopper/random_v2.yaml +++ b/configs/offline/iql/hopper/random_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -15,6 +16,8 @@ n_episodes: 10 name: IQL normalize: true normalize_reward: false +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/maze2d/large_dense_v1.yaml b/configs/offline/iql/maze2d/large_dense_v1.yaml index adab7c8b..e26451f3 100644 --- a/configs/offline/iql/maze2d/large_dense_v1.yaml +++ b/configs/offline/iql/maze2d/large_dense_v1.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -15,6 +16,8 @@ n_episodes: 100 name: IQL normalize: true normalize_reward: false +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/maze2d/large_v1.yaml b/configs/offline/iql/maze2d/large_v1.yaml index 99392c5f..06d18c5e 100644 --- a/configs/offline/iql/maze2d/large_v1.yaml +++ b/configs/offline/iql/maze2d/large_v1.yaml @@ -1,3 
+1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -16,6 +17,8 @@ name: IQL normalize: true normalize_reward: false project: CORL +qf_lr: 3e-4 seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/maze2d/medium_dense_v1.yaml b/configs/offline/iql/maze2d/medium_dense_v1.yaml index a0cdfec2..16b27f50 100644 --- a/configs/offline/iql/maze2d/medium_dense_v1.yaml +++ b/configs/offline/iql/maze2d/medium_dense_v1.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -15,6 +16,8 @@ n_episodes: 100 name: IQL normalize: true normalize_reward: false +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/maze2d/medium_v1.yaml b/configs/offline/iql/maze2d/medium_v1.yaml index cd83dd62..ce47cd12 100644 --- a/configs/offline/iql/maze2d/medium_v1.yaml +++ b/configs/offline/iql/maze2d/medium_v1.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -16,6 +17,8 @@ name: IQL normalize: true normalize_reward: false project: CORL +qf_lr: 3e-4 seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/maze2d/umaze_dense_v1.yaml b/configs/offline/iql/maze2d/umaze_dense_v1.yaml index d65a1c69..43047a32 100644 --- a/configs/offline/iql/maze2d/umaze_dense_v1.yaml +++ b/configs/offline/iql/maze2d/umaze_dense_v1.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -15,6 +16,8 @@ n_episodes: 100 name: IQL normalize: true normalize_reward: false +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/maze2d/umaze_v1.yaml b/configs/offline/iql/maze2d/umaze_v1.yaml index c67a227c..a694ddd8 100644 --- a/configs/offline/iql/maze2d/umaze_v1.yaml +++ b/configs/offline/iql/maze2d/umaze_v1.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -16,6 +17,8 @@ name: IQL normalize: true normalize_reward: false project: CORL +qf_lr: 3e-4 seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/pen/cloned_v1.yaml b/configs/offline/iql/pen/cloned_v1.yaml new file mode 100644 index 00000000..9c623da3 --- /dev/null +++ b/configs/offline/iql/pen/cloned_v1.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +device: cuda +discount: 0.99 +env: pen-cloned-v1 +eval_freq: 5000 +group: iql-adroit-pen-cloned-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: IQL +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/pen/expert_v1.yaml b/configs/offline/iql/pen/expert_v1.yaml new file mode 100644 index 00000000..8a9f2e4f --- /dev/null +++ b/configs/offline/iql/pen/expert_v1.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +device: cuda +discount: 0.99 +env: pen-expert-v1 +eval_freq: 5000 +group: iql-adroit-pen-expert-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: IQL +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/pen/human_v1.yaml b/configs/offline/iql/pen/human_v1.yaml new file mode 100644 index 00000000..5f33c109 --- /dev/null +++ b/configs/offline/iql/pen/human_v1.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 
10000000 +device: cuda +discount: 0.99 +env: pen-human-v1 +eval_freq: 5000 +group: iql-adroit-pen-human-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: IQL +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/relocate/cloned_v1.yaml b/configs/offline/iql/relocate/cloned_v1.yaml new file mode 100644 index 00000000..d9ad2bf7 --- /dev/null +++ b/configs/offline/iql/relocate/cloned_v1.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +device: cuda +discount: 0.99 +env: relocate-cloned-v1 +eval_freq: 5000 +group: iql-adroit-relocate-cloned-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: IQL +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/relocate/expert_v1.yaml b/configs/offline/iql/relocate/expert_v1.yaml new file mode 100644 index 00000000..797e624f --- /dev/null +++ b/configs/offline/iql/relocate/expert_v1.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +device: cuda +discount: 0.99 +env: relocate-expert-v1 +eval_freq: 5000 +group: iql-adroit-relocate-expert-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: IQL +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/relocate/human_v1.yaml b/configs/offline/iql/relocate/human_v1.yaml new file mode 100644 index 00000000..ba410a79 --- /dev/null +++ b/configs/offline/iql/relocate/human_v1.yaml @@ -0,0 +1,23 @@ +actor_lr: 3e-4 +actor_dropout: 0.1 +batch_size: 256 +beta: 3.0 +buffer_size: 10000000 +device: cuda +discount: 0.99 +env: relocate-human-v1 +eval_freq: 5000 +group: iql-adroit-relocate-human-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: IQL +normalize: true +normalize_reward: false +qf_lr: 3e-4 +project: CORL +seed: 0 +tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/walker2d/expert_v2.yaml b/configs/offline/iql/walker2d/expert_v2.yaml index 9e54f157..e9b83e29 100644 --- a/configs/offline/iql/walker2d/expert_v2.yaml +++ b/configs/offline/iql/walker2d/expert_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -15,6 +16,8 @@ n_episodes: 10 name: IQL normalize: true normalize_reward: false +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/walker2d/full_replay_v2.yaml b/configs/offline/iql/walker2d/full_replay_v2.yaml index 96bcb386..af77643c 100644 --- a/configs/offline/iql/walker2d/full_replay_v2.yaml +++ b/configs/offline/iql/walker2d/full_replay_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -15,6 +16,8 @@ n_episodes: 10 name: IQL normalize: true normalize_reward: false +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/walker2d/medium_expert_v2.yaml b/configs/offline/iql/walker2d/medium_expert_v2.yaml index 26f5533c..b8648fcf 100644 --- a/configs/offline/iql/walker2d/medium_expert_v2.yaml +++ b/configs/offline/iql/walker2d/medium_expert_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 
10000000 @@ -16,6 +17,8 @@ name: IQL normalize: true normalize_reward: false project: CORL +qf_lr: 3e-4 seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/walker2d/medium_replay_v2.yaml b/configs/offline/iql/walker2d/medium_replay_v2.yaml index e4e6d6d6..3934c550 100644 --- a/configs/offline/iql/walker2d/medium_replay_v2.yaml +++ b/configs/offline/iql/walker2d/medium_replay_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -16,6 +17,8 @@ name: IQL normalize: true normalize_reward: false project: CORL +qf_lr: 3e-4 seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/walker2d/medium_v2.yaml b/configs/offline/iql/walker2d/medium_v2.yaml index a689b768..c6cdb73d 100644 --- a/configs/offline/iql/walker2d/medium_v2.yaml +++ b/configs/offline/iql/walker2d/medium_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -16,6 +17,8 @@ name: IQL normalize: true normalize_reward: false project: CORL +qf_lr: 3e-4 seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/iql/walker2d/random_v2.yaml b/configs/offline/iql/walker2d/random_v2.yaml index 5fe65fd1..659fce46 100644 --- a/configs/offline/iql/walker2d/random_v2.yaml +++ b/configs/offline/iql/walker2d/random_v2.yaml @@ -1,3 +1,4 @@ +actor_lr: 3e-4 batch_size: 256 beta: 3.0 buffer_size: 10000000 @@ -15,6 +16,8 @@ n_episodes: 10 name: IQL normalize: true normalize_reward: false +qf_lr: 3e-4 project: CORL seed: 0 tau: 0.005 +vf_lr: 3e-4 diff --git a/configs/offline/rebrac/antmaze/large_diverse_v2.yaml b/configs/offline/rebrac/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..71ab1a62 --- /dev/null +++ b/configs/offline/rebrac/antmaze/large_diverse_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.002 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.002 +critic_learning_rate: 0.00005 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: antmaze-large-diverse-v2 +eval_episodes: 100 +eval_every: 50 +eval_seed: 42 +gamma: 0.999 +group: rebrac-antmaze-large-diverse-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: true +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/antmaze/large_play_v2.yaml b/configs/offline/rebrac/antmaze/large_play_v2.yaml new file mode 100644 index 00000000..8d405a0e --- /dev/null +++ b/configs/offline/rebrac/antmaze/large_play_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.002 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.001 +critic_learning_rate: 0.00005 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: antmaze-large-play-v2 +eval_episodes: 100 +eval_every: 50 +eval_seed: 42 +gamma: 0.999 +group: rebrac-antmaze-large-play-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: true +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/antmaze/medium_diverse_v2.yaml b/configs/offline/rebrac/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..dbda7fba --- /dev/null +++ b/configs/offline/rebrac/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.001 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 
0.0 +critic_learning_rate: 0.00005 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: antmaze-medium-diverse-v2 +eval_episodes: 100 +eval_every: 50 +eval_seed: 42 +gamma: 0.999 +group: rebrac-antmaze-medium-diverse-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: true +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/antmaze/medium_play_v2.yaml b/configs/offline/rebrac/antmaze/medium_play_v2.yaml new file mode 100644 index 00000000..39fd72c9 --- /dev/null +++ b/configs/offline/rebrac/antmaze/medium_play_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.001 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.0005 +critic_learning_rate: 0.00005 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: antmaze-medium-play-v2 +eval_episodes: 100 +eval_every: 50 +eval_seed: 42 +gamma: 0.999 +group: rebrac-antmaze-medium-play-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: true +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/antmaze/umaze_diverse_v2.yaml b/configs/offline/rebrac/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..ed0b627c --- /dev/null +++ b/configs/offline/rebrac/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.003 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.001 +critic_learning_rate: 0.00005 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: antmaze-umaze-diverse-v2 +eval_episodes: 100 +eval_every: 50 +eval_seed: 42 +gamma: 0.999 +group: rebrac-antmaze-umaze-diverse-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: true +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/antmaze/umaze_v2.yaml b/configs/offline/rebrac/antmaze/umaze_v2.yaml new file mode 100644 index 00000000..7b85ff5c --- /dev/null +++ b/configs/offline/rebrac/antmaze/umaze_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.003 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.002 +critic_learning_rate: 0.00005 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: antmaze-umaze-v2 +eval_episodes: 100 +eval_every: 50 +eval_seed: 42 +gamma: 0.999 +group: rebrac-antmaze-umaze-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: true +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/door/cloned_v1.yaml b/configs/offline/rebrac/door/cloned_v1.yaml new file mode 100644 index 00000000..229efe9f --- /dev/null +++ b/configs/offline/rebrac/door/cloned_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.1 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: door-cloned-v1 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-door-cloned-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true 
+normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/door/expert_v1.yaml b/configs/offline/rebrac/door/expert_v1.yaml new file mode 100644 index 00000000..ccb7e266 --- /dev/null +++ b/configs/offline/rebrac/door/expert_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.05 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.01 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: door-expert-v1 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-door-expert-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/door/human_v1.yaml b/configs/offline/rebrac/door/human_v1.yaml new file mode 100644 index 00000000..6bbf0fa0 --- /dev/null +++ b/configs/offline/rebrac/door/human_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.1 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.1 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: door-human-v1 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-door-human-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/halfcheetah/expert_v2.yaml b/configs/offline/rebrac/halfcheetah/expert_v2.yaml new file mode 100644 index 00000000..6529b787 --- /dev/null +++ b/configs/offline/rebrac/halfcheetah/expert_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.01 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: halfcheetah-expert-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-halfcheetah-expert-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/halfcheetah/full_replay_v2.yaml b/configs/offline/rebrac/halfcheetah/full_replay_v2.yaml new file mode 100644 index 00000000..353ce9f1 --- /dev/null +++ b/configs/offline/rebrac/halfcheetah/full_replay_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.001 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.1 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: halfcheetah-full-replay-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-halfcheetah-full-replay-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/halfcheetah/medium_expert_v2.yaml b/configs/offline/rebrac/halfcheetah/medium_expert_v2.yaml new file mode 100644 
index 00000000..3a3b8993 --- /dev/null +++ b/configs/offline/rebrac/halfcheetah/medium_expert_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.1 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: halfcheetah-medium-expert-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-halfcheetah-medium-expert-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/halfcheetah/medium_replay_v2.yaml b/configs/offline/rebrac/halfcheetah/medium_replay_v2.yaml new file mode 100644 index 00000000..8600d7a2 --- /dev/null +++ b/configs/offline/rebrac/halfcheetah/medium_replay_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.001 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: halfcheetah-medium-replay-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-halfcheetah-medium-replay-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/halfcheetah/medium_v2.yaml b/configs/offline/rebrac/halfcheetah/medium_v2.yaml new file mode 100644 index 00000000..69ccca0c --- /dev/null +++ b/configs/offline/rebrac/halfcheetah/medium_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.001 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.01 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: halfcheetah-medium-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-halfcheetah-medium-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/halfcheetah/random_v2.yaml b/configs/offline/rebrac/halfcheetah/random_v2.yaml new file mode 100644 index 00000000..7d8fa36e --- /dev/null +++ b/configs/offline/rebrac/halfcheetah/random_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.001 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.1 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: halfcheetah-random-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-halfcheetah-random-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/hammer/cloned_v1.yaml b/configs/offline/rebrac/hammer/cloned_v1.yaml new file mode 100644 index 00000000..35f750c6 --- /dev/null +++ b/configs/offline/rebrac/hammer/cloned_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.1 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.5 
+critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: hammer-cloned-v1 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-hammer-cloned-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/hammer/expert_v1.yaml b/configs/offline/rebrac/hammer/expert_v1.yaml new file mode 100644 index 00000000..651f52dd --- /dev/null +++ b/configs/offline/rebrac/hammer/expert_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.01 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: hammer-expert-v1 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-hammer-expert-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/hammer/human_v1.yaml b/configs/offline/rebrac/hammer/human_v1.yaml new file mode 100644 index 00000000..a81dc1ac --- /dev/null +++ b/configs/offline/rebrac/hammer/human_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.5 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: hammer-human-v1 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-hammer-human-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/hopper/expert_v2.yaml b/configs/offline/rebrac/hopper/expert_v2.yaml new file mode 100644 index 00000000..3c0f41be --- /dev/null +++ b/configs/offline/rebrac/hopper/expert_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.1 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.1 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: hopper-expert-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-hopper-expert-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/hopper/full_replay_v2.yaml b/configs/offline/rebrac/hopper/full_replay_v2.yaml new file mode 100644 index 00000000..cb1ade97 --- /dev/null +++ b/configs/offline/rebrac/hopper/full_replay_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.01 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: hopper-full-replay-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-hopper-full-replay-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 
+policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/hopper/medium_expert_v2.yaml b/configs/offline/rebrac/hopper/medium_expert_v2.yaml new file mode 100644 index 00000000..f55b56b6 --- /dev/null +++ b/configs/offline/rebrac/hopper/medium_expert_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.1 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.01 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: hopper-medium-expert-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-hopper-medium-expert-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/hopper/medium_replay_v2.yaml b/configs/offline/rebrac/hopper/medium_replay_v2.yaml new file mode 100644 index 00000000..c74253d1 --- /dev/null +++ b/configs/offline/rebrac/hopper/medium_replay_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.05 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.5 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: hopper-medium-replay-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-hopper-medium-replay-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/hopper/medium_v2.yaml b/configs/offline/rebrac/hopper/medium_v2.yaml new file mode 100644 index 00000000..ba59fbf6 --- /dev/null +++ b/configs/offline/rebrac/hopper/medium_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.01 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: hopper-medium-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-hopper-medium-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/hopper/random_v2.yaml b/configs/offline/rebrac/hopper/random_v2.yaml new file mode 100644 index 00000000..9e24d024 --- /dev/null +++ b/configs/offline/rebrac/hopper/random_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.001 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.01 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: hopper-random-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-hopper-random-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/maze2d/large_v1.yaml b/configs/offline/rebrac/maze2d/large_v1.yaml new file mode 100644 index 00000000..16d8b5de --- /dev/null +++ b/configs/offline/rebrac/maze2d/large_v1.yaml @@ -0,0 +1,28 @@ 
+actor_bc_coef: 0.003 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.001 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: maze2d-large-v1 +eval_episodes: 100 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-maze2d-large-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/maze2d/medium_v1.yaml b/configs/offline/rebrac/maze2d/medium_v1.yaml new file mode 100644 index 00000000..06e3a7a1 --- /dev/null +++ b/configs/offline/rebrac/maze2d/medium_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.003 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.001 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: maze2d-medium-v1 +eval_episodes: 100 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-maze2d-medium-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/maze2d/umaze_v1.yaml b/configs/offline/rebrac/maze2d/umaze_v1.yaml new file mode 100644 index 00000000..902f66d5 --- /dev/null +++ b/configs/offline/rebrac/maze2d/umaze_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.003 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.001 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: maze2d-umaze-v1 +eval_episodes: 100 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-maze2d-umaze-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/pen/cloned_v1.yaml b/configs/offline/rebrac/pen/cloned_v1.yaml new file mode 100644 index 00000000..dc4f3d7a --- /dev/null +++ b/configs/offline/rebrac/pen/cloned_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.05 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.5 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: pen-cloned-v1 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-pen-cloned-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/pen/expert_v1.yaml b/configs/offline/rebrac/pen/expert_v1.yaml new file mode 100644 index 00000000..08d49218 --- /dev/null +++ b/configs/offline/rebrac/pen/expert_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.01 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: pen-expert-v1 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-pen-expert-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true 
+normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/pen/human_v1.yaml b/configs/offline/rebrac/pen/human_v1.yaml new file mode 100644 index 00000000..03b2b628 --- /dev/null +++ b/configs/offline/rebrac/pen/human_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.1 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.5 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: pen-human-v1 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-pen-human-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/relocate/cloned_v1.yaml b/configs/offline/rebrac/relocate/cloned_v1.yaml new file mode 100644 index 00000000..3db2d28c --- /dev/null +++ b/configs/offline/rebrac/relocate/cloned_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.1 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.01 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: relocate-cloned-v1 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-relocate-cloned-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/relocate/expert_v1.yaml b/configs/offline/rebrac/relocate/expert_v1.yaml new file mode 100644 index 00000000..bafa6911 --- /dev/null +++ b/configs/offline/rebrac/relocate/expert_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.05 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.01 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: relocate-expert-v1 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-relocate-expert-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/relocate/human_v1.yaml b/configs/offline/rebrac/relocate/human_v1.yaml new file mode 100644 index 00000000..1944798a --- /dev/null +++ b/configs/offline/rebrac/relocate/human_v1.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.1 +actor_learning_rate: 0.0003 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 256 +critic_bc_coef: 0.01 +critic_learning_rate: 0.0003 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: relocate-human-v1 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-relocate-human-v1 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/walker2d/expert_v2.yaml b/configs/offline/rebrac/walker2d/expert_v2.yaml new file mode 100644 index 00000000..c727fa58 --- /dev/null +++ 
b/configs/offline/rebrac/walker2d/expert_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.5 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: walker2d-expert-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-walker2d-expert-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/walker2d/full_replay_v2.yaml b/configs/offline/rebrac/walker2d/full_replay_v2.yaml new file mode 100644 index 00000000..e4af9291 --- /dev/null +++ b/configs/offline/rebrac/walker2d/full_replay_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.01 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: walker2d-full-replay-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-walker2d-full-replay-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/walker2d/medium_expert_v2.yaml b/configs/offline/rebrac/walker2d/medium_expert_v2.yaml new file mode 100644 index 00000000..844f680d --- /dev/null +++ b/configs/offline/rebrac/walker2d/medium_expert_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.01 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: walker2d-medium-expert-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-walker2d-medium-expert-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/walker2d/medium_replay_v2.yaml b/configs/offline/rebrac/walker2d/medium_replay_v2.yaml new file mode 100644 index 00000000..493ef8d8 --- /dev/null +++ b/configs/offline/rebrac/walker2d/medium_replay_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.05 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.01 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: walker2d-medium-replay-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-walker2d-medium-replay-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/walker2d/medium_v2.yaml b/configs/offline/rebrac/walker2d/medium_v2.yaml new file mode 100644 index 00000000..3231960e --- /dev/null +++ b/configs/offline/rebrac/walker2d/medium_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.05 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.1 +critic_learning_rate: 0.001 +critic_ln: true 
+critic_n_hiddens: 3 +dataset_name: walker2d-medium-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-walker2d-medium-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/rebrac/walker2d/random_v2.yaml b/configs/offline/rebrac/walker2d/random_v2.yaml new file mode 100644 index 00000000..e0b7475c --- /dev/null +++ b/configs/offline/rebrac/walker2d/random_v2.yaml @@ -0,0 +1,28 @@ +actor_bc_coef: 0.01 +actor_learning_rate: 0.001 +actor_ln: false +actor_n_hiddens: 3 +batch_size: 1024 +critic_bc_coef: 0.0 +critic_learning_rate: 0.001 +critic_ln: true +critic_n_hiddens: 3 +dataset_name: walker2d-random-v2 +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: rebrac-walker2d-random-v2 +hidden_dim: 256 +name: rebrac +noise_clip: 0.5 +normalize_q: true +normalize_reward: false +normalize_states: false +num_epochs: 1000 +num_updates_on_epoch: 1000 +policy_freq: 2 +policy_noise: 0.2 +project: ReBRAC +tau: 0.005 +train_seed: 0 diff --git a/configs/offline/sac_n/antmaze/large_diverse_v2.yaml b/configs/offline/sac_n/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..dacbaf21 --- /dev/null +++ b/configs/offline/sac_n/antmaze/large_diverse_v2.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "antmaze-large-diverse-v2" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-antmaze-large-diverse-v2-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 25 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/antmaze/large_play_v0.yaml b/configs/offline/sac_n/antmaze/large_play_v2.yaml similarity index 83% rename from configs/offline/sac_n/antmaze/large_play_v0.yaml rename to configs/offline/sac_n/antmaze/large_play_v2.yaml index 9602c41a..f9c0172b 100644 --- a/configs/offline/sac_n/antmaze/large_play_v0.yaml +++ b/configs/offline/sac_n/antmaze/large_play_v2.yaml @@ -6,12 +6,12 @@ checkpoints_path: null critic_learning_rate: 0.0003 deterministic_torch: false device: cuda -env_name: "antmaze-large-play-v0" +env_name: "antmaze-large-play-v2" eval_episodes: 10 eval_every: 5 eval_seed: 42 gamma: 0.99 -group: "sac-n-antmaze-large-play-v0-multiseed-v0" +group: "sac-n-antmaze-large-play-v2-multiseed-v0" hidden_dim: 256 log_every: 100 max_action: 1.0 diff --git a/configs/offline/sac_n/antmaze/medium_diverse_v2.yaml b/configs/offline/sac_n/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..de5fca1a --- /dev/null +++ b/configs/offline/sac_n/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "antmaze-medium-diverse-v2" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-antmaze-medium-diverse-v2-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 25 +num_epochs: 3000 
+num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/antmaze/medium_play_v0.yaml b/configs/offline/sac_n/antmaze/medium_play_v2.yaml similarity index 83% rename from configs/offline/sac_n/antmaze/medium_play_v0.yaml rename to configs/offline/sac_n/antmaze/medium_play_v2.yaml index c640d4d4..b100f05a 100644 --- a/configs/offline/sac_n/antmaze/medium_play_v0.yaml +++ b/configs/offline/sac_n/antmaze/medium_play_v2.yaml @@ -6,12 +6,12 @@ checkpoints_path: null critic_learning_rate: 0.0003 deterministic_torch: false device: cuda -env_name: "antmaze-medium-play-v0" +env_name: "antmaze-medium-play-v2" eval_episodes: 10 eval_every: 5 eval_seed: 42 gamma: 0.99 -group: "sac-n-antmaze-medium-play-v0-multiseed-v0" +group: "sac-n-antmaze-medium-play-v2-multiseed-v0" hidden_dim: 256 log_every: 100 max_action: 1.0 diff --git a/configs/offline/sac_n/antmaze/umaze_diverse_v2.yaml b/configs/offline/sac_n/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..140c97c7 --- /dev/null +++ b/configs/offline/sac_n/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "antmaze-umaze-diverse-v2" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-antmaze-umaze-diverse-v2-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 25 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/antmaze/umaze_v0.yaml b/configs/offline/sac_n/antmaze/umaze_v2.yaml similarity index 85% rename from configs/offline/sac_n/antmaze/umaze_v0.yaml rename to configs/offline/sac_n/antmaze/umaze_v2.yaml index 987852a7..119bd0bb 100644 --- a/configs/offline/sac_n/antmaze/umaze_v0.yaml +++ b/configs/offline/sac_n/antmaze/umaze_v2.yaml @@ -6,12 +6,12 @@ checkpoints_path: null critic_learning_rate: 0.0003 deterministic_torch: false device: cuda -env_name: "antmaze-umaze-v0" +env_name: "antmaze-umaze-v2" eval_episodes: 10 eval_every: 5 eval_seed: 42 gamma: 0.99 -group: "sac-n-antmaze-umaze-v0-multiseed-v0" +group: "sac-n-antmaze-umaze-v2-multiseed-v0" hidden_dim: 256 log_every: 100 max_action: 1.0 diff --git a/configs/offline/sac_n/door/cloned_v1.yaml b/configs/offline/sac_n/door/cloned_v1.yaml new file mode 100644 index 00000000..4afd5ba2 --- /dev/null +++ b/configs/offline/sac_n/door/cloned_v1.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "door-cloned-v1" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-door-cloned-v1-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 100 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/door/expert_v1.yaml b/configs/offline/sac_n/door/expert_v1.yaml new file mode 100644 index 00000000..20383063 --- /dev/null +++ b/configs/offline/sac_n/door/expert_v1.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 
+checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "door-expert-v1" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-door-expert-v1-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 100 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/door/human_v1.yaml b/configs/offline/sac_n/door/human_v1.yaml new file mode 100644 index 00000000..1ac938ab --- /dev/null +++ b/configs/offline/sac_n/door/human_v1.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "door-human-v1" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-door-human-v1-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 100 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/hammer/cloned_v1.yaml b/configs/offline/sac_n/hammer/cloned_v1.yaml new file mode 100644 index 00000000..d4953982 --- /dev/null +++ b/configs/offline/sac_n/hammer/cloned_v1.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "hammer-cloned-v1" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-hammer-cloned-v1-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 100 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/hammer/expert_v1.yaml b/configs/offline/sac_n/hammer/expert_v1.yaml new file mode 100644 index 00000000..54dadb68 --- /dev/null +++ b/configs/offline/sac_n/hammer/expert_v1.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "hammer-expert-v1" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-hammer-expert-v1-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 100 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/hammer/human_v1.yaml b/configs/offline/sac_n/hammer/human_v1.yaml new file mode 100644 index 00000000..93f6a9cb --- /dev/null +++ b/configs/offline/sac_n/hammer/human_v1.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "hammer-human-v1" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-hammer-human-v1-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 100 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 
+train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/pen/cloned_v1.yaml b/configs/offline/sac_n/pen/cloned_v1.yaml new file mode 100644 index 00000000..2d89bd49 --- /dev/null +++ b/configs/offline/sac_n/pen/cloned_v1.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "pen-cloned-v1" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-pen-cloned-v1-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 100 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/pen/expert_v1.yaml b/configs/offline/sac_n/pen/expert_v1.yaml new file mode 100644 index 00000000..0f2010f5 --- /dev/null +++ b/configs/offline/sac_n/pen/expert_v1.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "pen-expert-v1" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-pen-expert-v1-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 100 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/pen/human_v1.yaml b/configs/offline/sac_n/pen/human_v1.yaml new file mode 100644 index 00000000..34e0af95 --- /dev/null +++ b/configs/offline/sac_n/pen/human_v1.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "pen-human-v1" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-pen-human-v1-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 100 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/relocate/cloned_v1.yaml b/configs/offline/sac_n/relocate/cloned_v1.yaml new file mode 100644 index 00000000..bcd2b292 --- /dev/null +++ b/configs/offline/sac_n/relocate/cloned_v1.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "relocate-cloned-v1" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-relocate-cloned-v1-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 100 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/relocate/expert_v1.yaml b/configs/offline/sac_n/relocate/expert_v1.yaml new file mode 100644 index 00000000..ea4973e7 --- /dev/null +++ b/configs/offline/sac_n/relocate/expert_v1.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 
+deterministic_torch: false +device: cuda +env_name: "relocate-expert-v1" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-relocate-expert-v1-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 100 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/sac_n/relocate/human_v1.yaml b/configs/offline/sac_n/relocate/human_v1.yaml new file mode 100644 index 00000000..29878381 --- /dev/null +++ b/configs/offline/sac_n/relocate/human_v1.yaml @@ -0,0 +1,25 @@ +actor_learning_rate: 0.0003 +alpha_learning_rate: 0.0003 +batch_size: 256 +buffer_size: 2000000 +checkpoints_path: null +critic_learning_rate: 0.0003 +deterministic_torch: false +device: cuda +env_name: "relocate-human-v1" +eval_episodes: 10 +eval_every: 5 +eval_seed: 42 +gamma: 0.99 +group: "sac-n-relocate-human-v1-multiseed-v0" +hidden_dim: 256 +log_every: 100 +max_action: 1.0 +name: "SAC-N" +normalize_reward: false +num_critics: 100 +num_epochs: 3000 +num_updates_on_epoch: 1000 +project: "CORL" +tau: 0.005 +train_seed: 10 \ No newline at end of file diff --git a/configs/offline/td3_bc/antmaze/large_diverse_v2.yaml b/configs/offline/td3_bc/antmaze/large_diverse_v2.yaml new file mode 100644 index 00000000..48f1b741 --- /dev/null +++ b/configs/offline/td3_bc/antmaze/large_diverse_v2.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-large-diverse-v2 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-antmaze-large-diverse-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 100 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: true +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/antmaze/large_play_v0.yaml b/configs/offline/td3_bc/antmaze/large_play_v2.yaml similarity index 80% rename from configs/offline/td3_bc/antmaze/large_play_v0.yaml rename to configs/offline/td3_bc/antmaze/large_play_v2.yaml index ef085f7c..bea5af20 100644 --- a/configs/offline/td3_bc/antmaze/large_play_v0.yaml +++ b/configs/offline/td3_bc/antmaze/large_play_v2.yaml @@ -4,10 +4,10 @@ buffer_size: 10000000 checkpoints_path: null device: cuda discount: 0.99 -env: antmaze-large-play-v0 +env: antmaze-large-play-v2 eval_freq: 5000 expl_noise: 0.1 -group: td3-bc-antmaze-large-play-v0-multiseed-v0 +group: td3-bc-antmaze-large-play-v2-multiseed-v0 load_model: '' max_timesteps: 1000000 n_episodes: 100 diff --git a/configs/offline/td3_bc/antmaze/medium_diverse_v2.yaml b/configs/offline/td3_bc/antmaze/medium_diverse_v2.yaml new file mode 100644 index 00000000..78e15f99 --- /dev/null +++ b/configs/offline/td3_bc/antmaze/medium_diverse_v2.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-medium-diverse-v2 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-antmaze-medium-diverse-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 100 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: true +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/antmaze/medium_play_v0.yaml b/configs/offline/td3_bc/antmaze/medium_play_v2.yaml similarity index 80% rename from configs/offline/td3_bc/antmaze/medium_play_v0.yaml rename to 
configs/offline/td3_bc/antmaze/medium_play_v2.yaml index 82be57fb..2621921d 100644 --- a/configs/offline/td3_bc/antmaze/medium_play_v0.yaml +++ b/configs/offline/td3_bc/antmaze/medium_play_v2.yaml @@ -4,10 +4,10 @@ buffer_size: 10000000 checkpoints_path: null device: cuda discount: 0.99 -env: antmaze-medium-play-v0 +env: antmaze-medium-play-v2 eval_freq: 5000 expl_noise: 0.1 -group: td3-bc-antmaze-medium-play-v0-multiseed-v0 +group: td3-bc-antmaze-medium-play-v2-multiseed-v0 load_model: '' max_timesteps: 1000000 n_episodes: 100 diff --git a/configs/offline/td3_bc/antmaze/umaze_diverse_v2.yaml b/configs/offline/td3_bc/antmaze/umaze_diverse_v2.yaml new file mode 100644 index 00000000..0ce2303a --- /dev/null +++ b/configs/offline/td3_bc/antmaze/umaze_diverse_v2.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: antmaze-umaze-diverse-v2 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-antmaze-umaze-diverse-v2-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 100 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: true +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/antmaze/umaze_v0.yaml b/configs/offline/td3_bc/antmaze/umaze_v2.yaml similarity index 82% rename from configs/offline/td3_bc/antmaze/umaze_v0.yaml rename to configs/offline/td3_bc/antmaze/umaze_v2.yaml index 3ad23962..206adfd6 100644 --- a/configs/offline/td3_bc/antmaze/umaze_v0.yaml +++ b/configs/offline/td3_bc/antmaze/umaze_v2.yaml @@ -4,10 +4,10 @@ buffer_size: 10000000 checkpoints_path: null device: cuda discount: 0.99 -env: antmaze-umaze-v0 +env: antmaze-umaze-v2 eval_freq: 5000 expl_noise: 0.1 -group: td3-bc-antmaze-umaze-v0-multiseed-v0 +group: td3-bc-antmaze-umaze-v2-multiseed-v0 load_model: '' max_timesteps: 1000000 n_episodes: 100 diff --git a/configs/offline/td3_bc/door/cloned_v1.yaml b/configs/offline/td3_bc/door/cloned_v1.yaml new file mode 100644 index 00000000..baabd1bc --- /dev/null +++ b/configs/offline/td3_bc/door/cloned_v1.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: door-cloned-v1 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-adroit-door-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: false +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/door/expert_v1.yaml b/configs/offline/td3_bc/door/expert_v1.yaml new file mode 100644 index 00000000..11868612 --- /dev/null +++ b/configs/offline/td3_bc/door/expert_v1.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: door-expert-v1 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-adroit-door-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: false +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/door/human_v1.yaml b/configs/offline/td3_bc/door/human_v1.yaml new file mode 100644 index 00000000..4384b2b5 --- /dev/null +++ b/configs/offline/td3_bc/door/human_v1.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: door-human-v1 +eval_freq: 5000 
+expl_noise: 0.1 +group: td3-bc-adroit-door-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: false +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/hammer/cloned_v1.yaml b/configs/offline/td3_bc/hammer/cloned_v1.yaml new file mode 100644 index 00000000..3d540c23 --- /dev/null +++ b/configs/offline/td3_bc/hammer/cloned_v1.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: hammer-cloned-v1 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-adroit-hammer-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: false +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/hammer/expert_v1.yaml b/configs/offline/td3_bc/hammer/expert_v1.yaml new file mode 100644 index 00000000..8022e31f --- /dev/null +++ b/configs/offline/td3_bc/hammer/expert_v1.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: hammer-expert-v1 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-adroit-hammer-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: false +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/hammer/human_v1.yaml b/configs/offline/td3_bc/hammer/human_v1.yaml new file mode 100644 index 00000000..988bf8e2 --- /dev/null +++ b/configs/offline/td3_bc/hammer/human_v1.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: hammer-human-v1 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-adroit-hammer-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: false +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/pen/cloned_v1.yaml b/configs/offline/td3_bc/pen/cloned_v1.yaml new file mode 100644 index 00000000..8474ac63 --- /dev/null +++ b/configs/offline/td3_bc/pen/cloned_v1.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: pen-cloned-v1 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-adroit-pen-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: false +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/pen/expert_v1.yaml b/configs/offline/td3_bc/pen/expert_v1.yaml new file mode 100644 index 00000000..b9426eb6 --- /dev/null +++ b/configs/offline/td3_bc/pen/expert_v1.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: pen-expert-v1 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-adroit-pen-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: false +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/pen/human_v1.yaml 
b/configs/offline/td3_bc/pen/human_v1.yaml new file mode 100644 index 00000000..08f6d80e --- /dev/null +++ b/configs/offline/td3_bc/pen/human_v1.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: pen-human-v1 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-adroit-pen-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: false +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/relocate/cloned_v1.yaml b/configs/offline/td3_bc/relocate/cloned_v1.yaml new file mode 100644 index 00000000..8b7f948c --- /dev/null +++ b/configs/offline/td3_bc/relocate/cloned_v1.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: relocate-cloned-v1 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-adroit-relocate-cloned-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: false +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/relocate/expert_v1.yaml b/configs/offline/td3_bc/relocate/expert_v1.yaml new file mode 100644 index 00000000..c60eaf7a --- /dev/null +++ b/configs/offline/td3_bc/relocate/expert_v1.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: relocate-expert-v1 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-adroit-relocate-expert-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: false +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/configs/offline/td3_bc/relocate/human_v1.yaml b/configs/offline/td3_bc/relocate/human_v1.yaml new file mode 100644 index 00000000..6e10b069 --- /dev/null +++ b/configs/offline/td3_bc/relocate/human_v1.yaml @@ -0,0 +1,22 @@ +alpha: 2.5 +batch_size: 256 +buffer_size: 10000000 +checkpoints_path: null +device: cuda +discount: 0.99 +env: relocate-human-v1 +eval_freq: 5000 +expl_noise: 0.1 +group: td3-bc-adroit-relocate-human-v1-multiseed-v0 +load_model: '' +max_timesteps: 1000000 +n_episodes: 10 +name: TD3-BC +noise_clip: 0.5 +normalize: true +normalize_reward: false +policy_freq: 2 +policy_noise: 0.2 +project: CORL +seed: 0 +tau: 0.005 diff --git a/docker/dev/requirements/requirements.txt b/docker/dev/requirements/requirements.txt index 00e3db5e..8d0b9e3c 100644 --- a/docker/dev/requirements/requirements.txt +++ b/docker/dev/requirements/requirements.txt @@ -7,5 +7,11 @@ numpy==1.23.1 gym[mujoco_py,classic_control]==0.23.0 --extra-index-url https://download.pytorch.org/whl/cu113 torch==1.11.0+cu113 -sortedcontainers==2.4.0 pyrallis==0.3.1 +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +jax==0.4.1 +jaxlib[cuda11_cudnn82]==0.4.1 +flax==0.6.1 +optax==0.1.3 +distrax==0.1.2 +chex==0.1.5 diff --git a/docker/dev/requirements/requirements_dev.txt b/docker/dev/requirements/requirements_dev.txt index c1916cb5..e0e489b0 100644 --- a/docker/dev/requirements/requirements_dev.txt +++ b/docker/dev/requirements/requirements_dev.txt @@ -7,8 +7,13 @@ numpy==1.23.1 gym[mujoco_py,classic_control]==0.23.0 --extra-index-url https://download.pytorch.org/whl/cu113 torch==1.11.0+cu113 -sortedcontainers==2.4.0 
pyrallis==0.3.1 -pre-commit==2.20.0 -catalyst-codestyle==21.9.2 -pytest==7.1.2 \ No newline at end of file +pre-commit==3.3.3 +ruff==0.0.278 +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +jax==0.4.1 +jaxlib[cuda11_cudnn82]==0.4.1 +flax==0.6.1 +optax==0.1.3 +distrax==0.1.2 +chex==0.1.5 diff --git a/pyproject.toml b/pyproject.toml index e042e09b..24607c5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,10 @@ -[tool.nitpick] -style = "https://raw.githubusercontent.com/catalyst-team/codestyle/v21.09.2/styles/nitpick-style-catalyst.toml" +[tool.ruff] +select = ["E", "F", "I001", "RUF100"] +ignore = ["E402"] +line-length = 89 +target-version = "py39" -[tool.black] -line-length = 89 \ No newline at end of file +[tool.ruff.isort] +combine-as-imports = true +lines-after-imports = 1 +order-by-type = false \ No newline at end of file diff --git a/results/README.md b/results/README.md new file mode 100644 index 00000000..6a3129ec --- /dev/null +++ b/results/README.md @@ -0,0 +1,6 @@ +# Reproducing figures and tables + +To reproduce all figures and tables from the paper, follow these steps: +1. Run `get_{offline, finetune}_urls.py` if needed. These scripts collect the urls of all wandb runs into .csv files and save them into the `runs_tables` folder. We already provide these tables, but you can recollect them. +2. Run `get_{offline, finetune}_scores.py` if needed. These scripts download data from the runs listed in the .csv files and save the evaluation scores (and regret in the offline-to-online case) into pickled files stored in the `bin` folder. We also provide the pickled data, but if you need to extract more information, you can modify the scripts for your purposes. +3. Run `get_{offline, finetune}_tables_and_plots.py`. These scripts read the pickled data, print all the tables, and save all the figures into the `out` directory.
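For reference, a minimal sketch of what the offline part of this pipeline might look like in practice. It assumes the scripts are run from the `results` directory with wandb access configured and take no extra arguments (the exact invocation may differ):

```commandline
cd results
# 1. collect run urls into runs_tables/offline_urls.csv (optional, the tables are already provided)
python get_offline_urls.py
# 2. download evaluation scores for those runs into bin/offline_scores.pickle
python get_offline_scores.py
# 3. print the tables and save the figures into the out/ directory
python get_offline_tables_and_plots.py
```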
diff --git a/results/bin/finetune_scores.pickle b/results/bin/finetune_scores.pickle new file mode 100644 index 00000000..d60377ec Binary files /dev/null and b/results/bin/finetune_scores.pickle differ diff --git a/results/bin/offline_scores.pickle b/results/bin/offline_scores.pickle new file mode 100644 index 00000000..f1fb38c4 Binary files /dev/null and b/results/bin/offline_scores.pickle differ diff --git a/results/get_finetune_scores.py b/results/get_finetune_scores.py new file mode 100644 index 00000000..dd31ad19 --- /dev/null +++ b/results/get_finetune_scores.py @@ -0,0 +1,59 @@ +import os +import pickle + +import pandas as pd +import wandb +from tqdm import tqdm + +dataframe = pd.read_csv("runs_tables/finetune_urls.csv") + +api = wandb.Api(timeout=29) + + +def get_run_scores(run_id, is_dt=False): + run = api.run(run_id) + score_key = None + full_scores = [] + regret = None + max_dt = -1e10 + + for k in run.history().keys(): + if "normalized" in k and "score" in k and "std" not in k: + if is_dt: + st = k + if "eval/" in st: + st = st.replace("eval/", "") + target = float(st.split("_")[0]) + if target > max_dt: + max_dt = target + score_key = k + else: + score_key = k + break + for _, row in run.history(keys=[score_key], samples=5000).iterrows(): + full_scores.append(row[score_key]) + for _, row in run.history(keys=["eval/regret"], samples=5000).iterrows(): + if "eval/regret" in row: + regret = row["eval/regret"] + offline_iters = len(full_scores) // 2 + return full_scores[:offline_iters], full_scores[offline_iters:], regret + + +def process_runs(df): + algorithms = df["algorithm"].unique() + datasets = df["dataset"].unique() + full_scores = {algo: {ds: [] for ds in datasets} for algo in algorithms} + for _, row in tqdm( + df.iterrows(), desc="Runs scores downloading", position=0, leave=True + ): + full_scores[row["algorithm"]][row["dataset"]].append( + get_run_scores(row["url"], row["algorithm"] == "DT") + ) + return full_scores + + +full_scores = process_runs(dataframe) + +os.makedirs("bin", exist_ok=True) +with open("bin/finetune_scores.pickle", "wb") as handle: + pickle.dump(full_scores, handle, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/results/get_finetune_tables_and_plots.py b/results/get_finetune_tables_and_plots.py new file mode 100644 index 00000000..d8072652 --- /dev/null +++ b/results/get_finetune_tables_and_plots.py @@ -0,0 +1,429 @@ +import os +import pickle + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from rliable import library as rly, metrics, plot_utils + +dataframe = pd.read_csv("runs_tables/finetune_urls.csv") +with open("bin/finetune_scores.pickle", "rb") as handle: + full_scores = pickle.load(handle) + +os.makedirs("./out", exist_ok=True) + + +def get_average_scores(scores): + avg_scores = {algo: {ds: None for ds in scores[algo]} for algo in scores} + stds = {algo: {ds: None for ds in scores[algo]} for algo in scores} + for algo in scores: + for data in scores[algo]: + sc = scores[algo][data] + if len(sc) > 0: + ml = min(map(len, sc)) + sc = [s[:ml] for s in sc] + scores[algo][data] = sc + avg_scores[algo][data] = np.mean(sc, axis=0) + stds[algo][data] = np.std(sc, axis=0) + + return avg_scores, stds + + +def get_max_scores(scores): + avg_scores = {algo: {ds: None for ds in scores[algo]} for algo in scores} + stds = {algo: {ds: None for ds in scores[algo]} for algo in scores} + for algo in scores: + for data in scores[algo]: + sc = scores[algo][data] + if len(sc) > 0: + ml = min(map(len, sc)) + sc = 
[s[:ml] for s in sc] + scores[algo][data] = sc + max_scores = np.max(sc, axis=1) + avg_scores[algo][data] = np.mean(max_scores) + stds[algo][data] = np.std(max_scores) + + return avg_scores, stds + + +def get_last_scores(avg_scores, avg_stds): + last_scores = { + algo: { + ds: avg_scores[algo][ds][-1] if avg_scores[algo][ds] is not None else None + for ds in avg_scores[algo] + } + for algo in avg_scores + } + stds = { + algo: { + ds: avg_stds[algo][ds][-1] if avg_stds[algo][ds] is not None else None + for ds in avg_scores[algo] + } + for algo in avg_scores + } + return last_scores, stds + + +full_offline_scores = { + algo: {data: None for data in full_scores[algo]} for algo in full_scores +} +full_online_scores = { + algo: {data: None for data in full_scores[algo]} for algo in full_scores +} +regrets = {algo: {data: None for data in full_scores[algo]} for algo in full_scores} +regrets_std = {algo: {data: None for data in full_scores[algo]} for algo in full_scores} + +for algo in full_offline_scores: + for data in full_offline_scores[algo]: + full_offline_scores[algo][data] = [s[0] for s in full_scores[algo][data]] + full_online_scores[algo][data] = [s[1] for s in full_scores[algo][data]] + regrets[algo][data] = np.mean([s[2] for s in full_scores[algo][data]]) + regrets_std[algo][data] = np.std([s[2] for s in full_scores[algo][data]]) + +avg_offline_scores, avg_offline_stds = get_average_scores(full_offline_scores) +max_offline_scores, max_offline_stds = get_max_scores(full_offline_scores) +last_offline_scores, last_offline_stds = get_last_scores( + avg_offline_scores, avg_offline_stds +) + +avg_online_scores, avg_online_stds = get_average_scores(full_online_scores) +max_online_scores, max_online_stds = get_max_scores(full_online_scores) +last_online_scores, last_online_stds = get_last_scores( + avg_online_scores, avg_online_stds +) + + +def add_domains_avg(scores): + for algo in scores: + antmaze = [ + scores[algo][data] + for data in [ + "antmaze-umaze-v2", + "antmaze-umaze-diverse-v2", + "antmaze-medium-play-v2", + "antmaze-medium-diverse-v2", + "antmaze-large-play-v2", + "antmaze-large-diverse-v2", + ] + ] + adroit = [ + scores[algo][data] + for data in [ + "pen-cloned-v1", + "door-cloned-v1", + "hammer-cloned-v1", + "relocate-cloned-v1", + ] + ] + + scores[algo]["antmaze avg"] = np.mean(antmaze) + scores[algo]["adroit avg"] = np.mean(adroit) + + scores[algo]["total avg"] = np.mean(np.hstack((antmaze, adroit))) + + +add_domains_avg(last_offline_scores) +add_domains_avg(last_online_scores) +add_domains_avg(regrets) + +algorithms = ["AWAC", "CQL", "IQL", "SPOT", "Cal-QL"] +datasets = dataframe["dataset"].unique() +ordered_datasets = [ + "antmaze-umaze-v2", + "antmaze-umaze-diverse-v2", + "antmaze-medium-play-v2", + "antmaze-medium-diverse-v2", + "antmaze-large-play-v2", + "antmaze-large-diverse-v2", + "antmaze avg", + "pen-cloned-v1", + "door-cloned-v1", + "hammer-cloned-v1", + "relocate-cloned-v1", + "adroit avg", + "total avg", +] + + +def get_table( + scores, + stds, + pm="$\\pm$", + delim=" & ", + row_delim="\\midrule", + row_end=" \\\\", + row_begin="", + scores2=None, + stds2=None, + scores_delim=" $\\to$ ", +): + rows = [row_begin + delim.join(["Task Name"] + algorithms) + row_end] + prev_env = "halfcheetah" + for data in ordered_datasets: + env = data.split("-")[0] + if env != prev_env: + if len(row_delim) > 0: + rows.append(row_delim) + prev_env = env + + row = [data] + + for algo in algorithms: + if data in stds[algo]: + row.append( + f"{scores[algo][data]:.2f} {pm} 
{stds[algo][data]:.2f}" + + ( + "" + if scores2 is None + else f"{scores_delim} {scores2[algo][data]:.2f} {pm} {stds2[algo][data]:.2f}" # noqa + ) + ) + else: + row.append( + f"{scores[algo][data]:.2f}" + + ( + "" + if scores2 is None + else f"{scores_delim} {scores2[algo][data]:.2f}" + ) + ) + rows.append(row_begin + delim.join(row) + row_end) + return "\n".join(rows) + + +print( + get_table( + last_offline_scores, + last_offline_stds, + scores2=last_online_scores, + stds2=last_online_stds, + ) +) +print() +print(get_table(regrets, regrets_std, "$\\pm$")) +print() +print( + get_table( + last_offline_scores, + last_offline_stds, + "±", + "|", + "", + "|", + "|", + scores2=last_online_scores, + stds2=last_online_stds, + scores_delim=" -> ", + ) +) +print() +print(get_table(regrets, regrets_std, "±", "|", "", "|", "|")) + +"""# Tunning plots""" + +plt.rcParams["figure.figsize"] = (15, 8) +plt.rcParams["figure.dpi"] = 300 +sns.set(style="ticks", font_scale=1.5) +# plt.rcParams.update({ +# "font.family": "serif", +# "font.serif": "Times New Roman" +# }) +# sns.set_palette("tab19") + +linestyles = [ + ("solid", "solid"), + ("dotted", (0, (1, 1))), + ("long dash with offset", (5, (10, 3))), + ("densely dashed", (0, (5, 1))), + ("densely dashdotted", (0, (3, 1, 1, 1))), + ("densely dashdotdotted", (0, (3, 1, 1, 1, 1, 1))), +] + +for data in datasets: + min_score = 1e6 + max_score = -1e6 + for i, algo in enumerate(algorithms): + if avg_online_scores[algo][data] is not None: + to_draw = avg_online_scores[algo][data] + std_draw = avg_online_stds[algo][data] + if len(to_draw) == 600 or len(to_draw) == 601: + to_draw = to_draw[::3] + std_draw = std_draw[::3] + if len(to_draw) == 1000: + to_draw = to_draw[::5] + std_draw = std_draw[::5] + if len(to_draw) == 3000: + to_draw = to_draw[::15] + std_draw = std_draw[::15] + steps = np.linspace(0, 1, len(to_draw)) + min_score = min(min_score, np.min(to_draw)) + max_score = max(max_score, np.max(to_draw)) + plt.plot( + steps, to_draw, label=algo, linestyle=linestyles[i % len(linestyles)][1] + ) + plt.fill_between(steps, to_draw - std_draw, to_draw + std_draw, alpha=0.1) + + plt.title(data) + plt.xlabel("Fraction of total tuning steps") + plt.ylabel("Normalized score") + plt.ylim([min_score - 3, max_score + 3]) + plt.legend(loc="center left", bbox_to_anchor=(1, 0.5)) + plt.grid() + plt.savefig(f"out/tuning_{data}.pdf", dpi=300, bbox_inches="tight") + # plt.show() + plt.close() + + +def convert_dataset_name(name): + name = name.replace("v2", "") + name = name.replace("v1", "") + name = name.replace("v0", "") + name = name.replace("medium-", "m-") + name = name.replace("umaze-", "u-") + name = name.replace("large-", "l-") + name = name.replace("replay-", "re-") + name = name.replace("random-", "ra-") + name = name.replace("expert-", "e-") + name = name.replace("play-", "p-") + name = name.replace("diverse-", "d-") + name = name.replace("human-", "h-") + name = name.replace("cloned-", "c-") + return name[:-1] + + +def plot_bars(scores, save_name): + agg_l = [] + + for env in ["antmaze", "pen", "door", "hammer", "relocate"]: + if env in ["halfcheetah", "hopper", "walker2d"]: + datas = ["medium-v2", "medium-expert-v2", "medium-replay-v2"] + elif "maze2d" in env: + datas = ["umaze-v1", "medium-v1", "large-v1"] + elif "antmaze" in env: + datas = [ + "umaze-v2", + "umaze-diverse-v2", + "medium-play-v2", + "medium-diverse-v2", + "large-play-v2", + "large-diverse-v2", + ] + else: + datas = ["cloned-v1"] + for data in datas: + line = convert_dataset_name(f"{env}-{data}") 
+ for algo in algorithms: + agg_l.append([algo, line, scores[algo][f"{env}-{data}"]]) + df_agg = pd.DataFrame(agg_l, columns=["Algorithm", "Dataset", "Normalized Score"]) + + sns.set(style="ticks", font_scale=2) + plt.rcParams["figure.figsize"] = (20, 10) # (10, 6) + + b = sns.barplot( + data=df_agg[df_agg.Dataset.apply(lambda x: "ant" in x)], + x="Dataset", + y="Normalized Score", + hue="Algorithm", + ) + # plt.tight_layout() + plt.xticks(fontsize=30) + plt.yticks(fontsize=30) + plt.legend(fontsize=10) + plt.xticks(rotation=45) + sns.move_legend(b, "upper left", bbox_to_anchor=(1, 1)) + plt.grid() + + plt.savefig(f"out/bars_{save_name}_ant.pdf", dpi=300, bbox_inches="tight") + # plt.show() + plt.close() + + b = sns.barplot( + data=df_agg[ + df_agg.Dataset.apply( + lambda x: "pen" in x or "hammer" in x or "door" in x or "relocate" in x + ) + ], + x="Dataset", + y="Normalized Score", + hue="Algorithm", + ) + plt.grid() + # plt.tight_layout() + plt.xticks(fontsize=30) + plt.yticks(fontsize=30) + plt.legend(fontsize=10) + plt.xticks(rotation=45) + sns.move_legend(b, "upper left", bbox_to_anchor=(1, 1)) + plt.savefig(f"out/bars_{save_name}_adroit.pdf", dpi=300, bbox_inches="tight") + # plt.show() + plt.close() + + +plot_bars(last_online_scores, "last_online") + +plt.rcParams["figure.figsize"] = (10, 6) +plt.rcParams["figure.dpi"] = 300 +sns.set(style="ticks", font_scale=0.5) +plt.rcParams.update( + { + # "font.family": "serif", + "font.serif": "Times New Roman" + } +) + + +def flatten(data): + res = {} + for algo in data: + flat = [] + for env in data[algo]: + if "avg" not in env: + env_list = np.array(data[algo][env])[:, -1] + flat.append(env_list) + res[algo] = np.array(flat).T + return res + + +flat = flatten(full_online_scores) + +algorithms = list(flat) + +normalized_score_dict = flat + +# Human normalized score thresholds +thresholds = np.linspace(-5.0, 150.0, 31) +score_distributions, score_distributions_cis = rly.create_performance_profile( + normalized_score_dict, thresholds +) +# Plot score distributions +fig, ax = plt.subplots(ncols=1, figsize=(7, 5)) +# plt.legend() +plot_utils.plot_performance_profiles( + score_distributions, + thresholds, + performance_profile_cis=score_distributions_cis, + colors=dict(zip(algorithms, sns.color_palette("colorblind"))), + xlabel=r"D4RL Normalized Score $(\tau)$", + ax=ax, + legend=True, +) +plt.savefig("out/perf_profiles_online.pdf", dpi=300, bbox_inches="tight") + +algorithm_pairs = {} +sns.set(style="ticks", font_scale=0.5) +algs = [ + "SPOT", + "CQL", + "IQL", + "AWAC", +] +for a1 in ["Cal-QL"]: + for a2 in algs: + algorithm_pairs[f"{a1},{a2}"] = (flat[a1], flat[a2]) +average_probabilities, average_prob_cis = rly.get_interval_estimates( + algorithm_pairs, metrics.probability_of_improvement, reps=200 +) +ax = plot_utils.plot_probability_of_improvement(average_probabilities, average_prob_cis) +# ax.set_xlim(0.5, 0.8) +plt.savefig("out/improvement_probability_online.pdf", dpi=300, bbox_inches="tight") diff --git a/results/get_finetune_urls.py b/results/get_finetune_urls.py new file mode 100644 index 00000000..12d7e374 --- /dev/null +++ b/results/get_finetune_urls.py @@ -0,0 +1,46 @@ +import pandas as pd +import wandb + +collected_urls = { + "algorithm": [], + "dataset": [], + "url": [], +} + + +def get_urls(sweep_id, algo_name): + s = sweep_id + api = wandb.Api(timeout=39) + sweep = api.sweep(s) + runs = sweep.runs + for run in runs: + if "env" in run.config: + dataset = run.config["env"] + elif "env_name" in run.config: + dataset = 
run.config["env_name"] + name = algo_name + if "10" in "-".join(run.name.split("-")[:-1]): + name = "10% " + name + if "medium" not in dataset: + if "cheetah" in dataset or "hopper" in dataset or "walker" in dataset: + continue + if "v0" not in dataset and "dense" not in dataset: + print(name, dataset, run.url) + collected_urls["algorithm"].append(name) + collected_urls["dataset"].append(dataset) + collected_urls["url"].append(run.url.replace("https://wandb.ai/", "")) + + +get_urls("tlab/CORL/sweeps/7c42z4dz", "SPOT") + +get_urls("tlab/CORL/sweeps/l3an1ck7", "AWAC") + +get_urls("tlab/CORL/sweeps/snbq2jky", "CQL") + +get_urls("tlab/CORL/sweeps/ucrmi909", "IQL") + +get_urls("tlab/CORL/sweeps/efvz7d68", "Cal-QL") + +dataframe = pd.DataFrame(collected_urls) + +dataframe.to_csv("runs_tables/finetune_urls.csv", index=False) diff --git a/results/get_offline_scores.py b/results/get_offline_scores.py new file mode 100644 index 00000000..ff66df28 --- /dev/null +++ b/results/get_offline_scores.py @@ -0,0 +1,56 @@ +import os +import pickle + +import pandas as pd +import wandb +from tqdm import tqdm + +dataframe = pd.read_csv("runs_tables/offline_urls.csv") + +api = wandb.Api(timeout=29) + + +def get_run_scores(run_id, is_dt=False): + run = api.run(run_id) + score_key = None + all_scores = [] + max_dt = -1e10 + + for k in run.history().keys(): + if "normalized" in k and "score" in k and "std" not in k: + if is_dt: + st = k + if "eval/" in st: + st = st.replace("eval/", "") + target = float(st.split("_")[0]) + if target > max_dt: + max_dt = target + score_key = k + else: + score_key = k + break + for _, row in run.history(keys=[score_key], samples=5000).iterrows(): + all_scores.append(row[score_key]) + return all_scores + + +def process_runs(df): + algorithms = df["algorithm"].unique() + datasets = df["dataset"].unique() + full_scores = {algo: {ds: [] for ds in datasets} for algo in algorithms} + for _, row in tqdm( + df.iterrows(), desc="Runs scores downloading", position=0, leave=True + ): + full_scores[row["algorithm"]][row["dataset"]].append( + get_run_scores(row["url"], row["algorithm"] == "DT") + ) + return full_scores + + +# Run if runs must be recollected +full_scores = process_runs(dataframe) + +os.makedirs("bin", exist_ok=True) + +with open("bin/offline_scores.pickle", "wb") as handle: + pickle.dump(full_scores, handle, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/results/get_offline_tables_and_plots.py b/results/get_offline_tables_and_plots.py new file mode 100644 index 00000000..a8f6b4ae --- /dev/null +++ b/results/get_offline_tables_and_plots.py @@ -0,0 +1,482 @@ +import os +import pickle + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from rliable import library as rly, metrics, plot_utils + +dataframe = pd.read_csv("runs_tables/offline_urls.csv") +with open("bin/offline_scores.pickle", "rb") as handle: + full_scores = pickle.load(handle) + +os.makedirs("./out", exist_ok=True) + + +def get_average_scores(scores): + avg_scores = {algo: {ds: None for ds in scores[algo]} for algo in scores} + stds = {algo: {ds: None for ds in scores[algo]} for algo in scores} + for algo in scores: + for data in scores[algo]: + sc = scores[algo][data] + if len(sc) > 0: + ml = min(map(len, sc)) + sc = [s[:ml] for s in sc] + scores[algo][data] = sc + avg_scores[algo][data] = np.mean(sc, axis=0) + stds[algo][data] = np.std(sc, axis=0) + + return avg_scores, stds + + +def get_max_scores(scores): + avg_scores = {algo: {ds: None for ds in scores[algo]} for algo 
in scores} + stds = {algo: {ds: None for ds in scores[algo]} for algo in scores} + for algo in scores: + for data in scores[algo]: + sc = scores[algo][data] + if len(sc) > 0: + ml = min(map(len, sc)) + sc = [s[:ml] for s in sc] + scores[algo][data] = sc + max_scores = np.max(sc, axis=1) + avg_scores[algo][data] = np.mean(max_scores) + stds[algo][data] = np.std(max_scores) + + return avg_scores, stds + + +def get_last_scores(avg_scores, avg_stds): + last_scores = { + algo: { + ds: avg_scores[algo][ds][-1] if avg_scores[algo][ds] is not None else None + for ds in avg_scores[algo] + } + for algo in avg_scores + } + stds = { + algo: { + ds: avg_stds[algo][ds][-1] if avg_stds[algo][ds] is not None else None + for ds in avg_scores[algo] + } + for algo in avg_scores + } + return last_scores, stds + + +avg_scores, avg_stds = get_average_scores(full_scores) +max_scores, max_stds = get_max_scores(full_scores) +last_scores, last_stds = get_last_scores(avg_scores, avg_stds) + + +def add_domains_avg(scores): + for algo in scores: + locomotion = [ + scores[algo][data] + for data in [ + "halfcheetah-medium-v2", + "halfcheetah-medium-replay-v2", + "halfcheetah-medium-expert-v2", + "hopper-medium-v2", + "hopper-medium-replay-v2", + "hopper-medium-expert-v2", + "walker2d-medium-v2", + "walker2d-medium-replay-v2", + "walker2d-medium-expert-v2", + ] + ] + antmaze = [ + scores[algo][data] + for data in [ + "antmaze-umaze-v2", + "antmaze-umaze-diverse-v2", + "antmaze-medium-play-v2", + "antmaze-medium-diverse-v2", + "antmaze-large-play-v2", + "antmaze-large-diverse-v2", + ] + ] + maze2d = [ + scores[algo][data] + for data in [ + "maze2d-umaze-v1", + "maze2d-medium-v1", + "maze2d-large-v1", + ] + ] + + adroit = [ + scores[algo][data] + for data in [ + "pen-human-v1", + "pen-cloned-v1", + "pen-expert-v1", + "door-human-v1", + "door-cloned-v1", + "door-expert-v1", + "hammer-human-v1", + "hammer-cloned-v1", + "hammer-expert-v1", + "relocate-human-v1", + "relocate-cloned-v1", + "relocate-expert-v1", + ] + ] + + scores[algo]["locomotion avg"] = np.mean(locomotion) + scores[algo]["antmaze avg"] = np.mean(antmaze) + scores[algo]["maze2d avg"] = np.mean(maze2d) + scores[algo]["adroit avg"] = np.mean(adroit) + + scores[algo]["total avg"] = np.mean( + np.hstack((locomotion, antmaze, maze2d, adroit)) + ) + + +add_domains_avg(last_scores) +add_domains_avg(max_scores) + +algorithms = [ + "BC", + "10% BC", + "TD3+BC", + "AWAC", + "CQL", + "IQL", + "ReBRAC", + "SAC-N", + "EDAC", + "DT", +] +datasets = dataframe["dataset"].unique() +ordered_datasets = [ + "halfcheetah-medium-v2", + "halfcheetah-medium-replay-v2", + "halfcheetah-medium-expert-v2", + "hopper-medium-v2", + "hopper-medium-replay-v2", + "hopper-medium-expert-v2", + "walker2d-medium-v2", + "walker2d-medium-replay-v2", + "walker2d-medium-expert-v2", + "locomotion avg", + "maze2d-umaze-v1", + "maze2d-medium-v1", + "maze2d-large-v1", + "maze2d avg", + "antmaze-umaze-v2", + "antmaze-umaze-diverse-v2", + "antmaze-medium-play-v2", + "antmaze-medium-diverse-v2", + "antmaze-large-play-v2", + "antmaze-large-diverse-v2", + "antmaze avg", + "pen-human-v1", + "pen-cloned-v1", + "pen-expert-v1", + "door-human-v1", + "door-cloned-v1", + "door-expert-v1", + "hammer-human-v1", + "hammer-cloned-v1", + "hammer-expert-v1", + "relocate-human-v1", + "relocate-cloned-v1", + "relocate-expert-v1", + "adroit avg", + "total avg", +] + +"""# Tables""" + + +def get_table( + scores, + stds, + pm="$\\pm$", + delim=" & ", + row_delim="\\midrule", + row_end=" \\\\", + row_begin="", +): + rows = 
[row_begin + delim.join(["Task Name"] + algorithms) + row_end] + prev_env = "halfcheetah" + for data in ordered_datasets: + env = data.split("-")[0] + if env != prev_env: + if len(row_delim) > 0: + rows.append(row_delim) + prev_env = env + + row = [data] + + for algo in algorithms: + if data in stds[algo]: + row.append(f"{scores[algo][data]:.2f} {pm} {stds[algo][data]:.2f}") + else: + row.append(f"{scores[algo][data]:.2f}") + rows.append(row_begin + delim.join(row) + row_end) + return "\n".join(rows) + + +print(get_table(last_scores, last_stds)) +print() +print(get_table(max_scores, max_stds)) +print() +print(get_table(last_scores, last_stds, "±", "|", "", "|", "|")) +print() +print(get_table(max_scores, max_stds, "±", "|", "", "|", "|")) + +os.makedirs("out", exist_ok=True) + +plt.rcParams["figure.figsize"] = (15, 8) +plt.rcParams["figure.dpi"] = 300 +sns.set(style="ticks", font_scale=1.5) + + +linestyles = [ + ("solid", "solid"), + ("dotted", (0, (1, 1))), + ("long dash with offset", (5, (10, 3))), + ("densely dashed", (0, (5, 1))), + ("densely dashdotted", (0, (3, 1, 1, 1))), + ("densely dashdotdotted", (0, (3, 1, 1, 1, 1, 1))), +] + +for data in datasets: + min_score = 1e6 + max_score = -1e6 + for i, algo in enumerate(algorithms): + if avg_scores[algo][data] is not None: + to_draw = avg_scores[algo][data] + std_draw = avg_stds[algo][data] + if len(to_draw) == 600 or len(to_draw) == 601: + to_draw = to_draw[::3] + std_draw = std_draw[::3] + if len(to_draw) == 1000: + to_draw = to_draw[::5] + std_draw = std_draw[::5] + if len(to_draw) == 3000: + to_draw = to_draw[::15] + std_draw = std_draw[::15] + steps = np.linspace(0, 1, len(to_draw)) + min_score = min(min_score, np.min(to_draw)) + max_score = max(max_score, np.max(to_draw)) + plt.plot( + steps, to_draw, label=algo, linestyle=linestyles[i % len(linestyles)][1] + ) + plt.fill_between(steps, to_draw - std_draw, to_draw + std_draw, alpha=0.1) + + plt.title(data) + plt.xlabel("Fraction of total steps") + plt.ylabel("Normalized score") + plt.ylim([min_score - 3, max_score + 3]) + plt.legend(loc="center left", bbox_to_anchor=(1, 0.5)) + plt.grid() + plt.savefig(f"out/{data}.pdf", dpi=300, bbox_inches="tight") + # plt.show() + plt.close() + + +def convert_dataset_name(name): + name = name.replace("v2", "") + name = name.replace("v1", "") + name = name.replace("v0", "") + name = name.replace("medium-", "m-") + name = name.replace("umaze-", "u-") + name = name.replace("large-", "l-") + name = name.replace("replay-", "re-") + name = name.replace("random-", "ra-") + name = name.replace("expert-", "e-") + name = name.replace("play-", "p-") + name = name.replace("diverse-", "d-") + name = name.replace("human-", "h-") + name = name.replace("cloned-", "c-") + return name[:-1] + + +def plot_bars(scores, save_name): + agg_l = [] + + for env in [ + "halfcheetah", + "hopper", + "walker2d", + "maze2d", + "antmaze", + "pen", + "door", + "hammer", + "relocate", + ]: + if env in ["halfcheetah", "hopper", "walker2d"]: + datas = ["medium-v2", "medium-expert-v2", "medium-replay-v2"] + elif "maze2d" in env: + datas = ["umaze-v1", "medium-v1", "large-v1"] + elif "antmaze" in env: + datas = [ + "umaze-v2", + "umaze-diverse-v2", + "medium-play-v2", + "medium-diverse-v2", + "large-play-v2", + "large-diverse-v2", + ] + else: + datas = ["human-v1", "cloned-v1", "expert-v1"] + for data in datas: + line = convert_dataset_name(f"{env}-{data}") + for algo in algorithms: + agg_l.append([algo, line, scores[algo][f"{env}-{data}"]]) + df_agg = pd.DataFrame(agg_l, 
columns=["Algorithm", "Dataset", "Normalized Score"]) + + sns.set(style="ticks", font_scale=2) + plt.rcParams["figure.figsize"] = (20, 10) # (10, 6) + + b = sns.barplot( + data=df_agg[ + df_agg.Dataset.apply( + lambda x: "cheetah" in x or "hopper" in x or "walker" in x + ) + ], + x="Dataset", + y="Normalized Score", + hue="Algorithm", + ) + plt.grid() + # plt.tight_layout() + plt.xticks(fontsize=30) + plt.yticks(fontsize=30) + plt.legend(fontsize=10) + plt.xticks(rotation=45) + sns.move_legend(b, "upper left", bbox_to_anchor=(1, 1)) + plt.savefig(f"out/bars_{save_name}_loco.pdf", dpi=300, bbox_inches="tight") + # plt.show() + plt.close() + + b = sns.barplot( + data=df_agg[df_agg.Dataset.apply(lambda x: "maze2d" in x)], + x="Dataset", + y="Normalized Score", + hue="Algorithm", + ) + # plt.tight_layout() + plt.xticks(fontsize=30) + plt.yticks(fontsize=30) + plt.legend(fontsize=10) + plt.xticks(rotation=45) + sns.move_legend(b, "upper left", bbox_to_anchor=(1, 1)) + plt.grid() + + plt.savefig(f"out/bars_{save_name}_maze.pdf", dpi=300, bbox_inches="tight") + # plt.show() + plt.close() + + b = sns.barplot( + data=df_agg[df_agg.Dataset.apply(lambda x: "ant" in x)], + x="Dataset", + y="Normalized Score", + hue="Algorithm", + ) + # plt.tight_layout() + plt.xticks(fontsize=30) + plt.yticks(fontsize=30) + plt.legend(fontsize=10) + plt.xticks(rotation=45) + sns.move_legend(b, "upper left", bbox_to_anchor=(1, 1)) + plt.grid() + + plt.savefig(f"out/bars_{save_name}_ant.pdf", dpi=300, bbox_inches="tight") + # plt.show() + plt.close() + + b = sns.barplot( + data=df_agg[ + df_agg.Dataset.apply( + lambda x: "pen" in x or "hammer" in x or "door" in x or "relocate" in x + ) + ], + x="Dataset", + y="Normalized Score", + hue="Algorithm", + ) + plt.grid() + # plt.tight_layout() + plt.xticks(fontsize=30) + plt.yticks(fontsize=30) + plt.legend(fontsize=10) + plt.xticks(rotation=45) + sns.move_legend(b, "upper left", bbox_to_anchor=(1, 1)) + plt.savefig(f"out/bars_{save_name}_adroit.pdf", dpi=300, bbox_inches="tight") + # plt.show() + plt.close() + + +plot_bars(last_scores, "last") + +plot_bars(last_scores, "max") + + +def flatten(data): + res = {} + for algo in data: + flat = [] + for env in data[algo]: + if "avg" not in env: + env_list = np.array(data[algo][env])[:, -1] + flat.append(env_list) + res[algo] = np.array(flat).T + return res + + +flat = flatten(full_scores) + +plt.rcParams["figure.figsize"] = (10, 6) +plt.rcParams["figure.dpi"] = 300 +sns.set(style="ticks", font_scale=0.5) +plt.rcParams.update( + { + # "font.family": "serif", + "font.serif": "Times New Roman" + } +) +# sns.set_palette("tab19") + +algorithms = list(flat) + +normalized_score_dict = flat + +# Human normalized score thresholds +thresholds = np.linspace(-5.0, 150.0, 31) +score_distributions, score_distributions_cis = rly.create_performance_profile( + normalized_score_dict, thresholds +) +# Plot score distributions +fig, ax = plt.subplots(ncols=1, figsize=(7, 5)) +# plt.legend() +plot_utils.plot_performance_profiles( + score_distributions, + thresholds, + performance_profile_cis=score_distributions_cis, + colors=dict(zip(algorithms, sns.color_palette("colorblind"))), + xlabel=r"D4RL Normalized Score $(\tau)$", + ax=ax, + legend=True, +) +plt.savefig("out/perf_profiles_offline.pdf", dpi=300, bbox_inches="tight") +plt.close() + +algorithm_pairs = {} +sns.set(style="ticks", font_scale=0.5) +algs = ["IQL", "AWAC", "EDAC", "SAC-N", "CQL", "TD3+BC", "DT", "BC", "10% BC"] +for a1 in ["ReBRAC"]: + for a2 in algs: + algorithm_pairs[f"{a1},{a2}"] = 
(flat[a1], flat[a2]) +average_probabilities, average_prob_cis = rly.get_interval_estimates( + algorithm_pairs, metrics.probability_of_improvement, reps=200 +) +ax = plot_utils.plot_probability_of_improvement(average_probabilities, average_prob_cis) +# ax.set_xlim(0.5, 0.8) +plt.savefig("out/improvement_probability_offline.pdf", dpi=300, bbox_inches="tight") +plt.close() diff --git a/results/get_offline_urls.py b/results/get_offline_urls.py new file mode 100644 index 00000000..78bd763d --- /dev/null +++ b/results/get_offline_urls.py @@ -0,0 +1,1441 @@ +import pandas as pd +import wandb + +collected_urls = { + "algorithm": [], + "dataset": [], + "url": [], +} + + +def get_urls(sweep_id, algo_name): + s = sweep_id + api = wandb.Api(timeout=39) + sweep = api.sweep(s) + runs = sweep.runs + for run in runs: + if "env" in run.config: + dataset = run.config["env"] + elif "env_name" in run.config: + dataset = run.config["env_name"] + elif "dataset_name" in run.config: + dataset = run.config["dataset_name"] + name = algo_name + if "10" in "-".join(run.name.split("-")[:-1]): + name = "10% " + name + if "medium" not in dataset: + if "cheetah" in dataset or "hopper" in dataset or "walker" in dataset: + continue + if "v0" not in dataset and "dense" not in dataset: + print(name, dataset, run.url) + collected_urls["algorithm"].append(name) + collected_urls["dataset"].append(dataset) + collected_urls["url"].append(run.url.replace("https://wandb.ai/", "")) + + +get_urls("tlab/CORL/sweeps/vs7dn9cw", "BC") + +get_urls("tlab/CORL/sweeps/n0cbyj25", "BC") + +get_urls("tlab/CORL/sweeps/uooh3e8g", "AWAC") + +get_urls("tlab/CORL/sweeps/ttrkt97z", "TD3+BC") + +get_urls("tlab/CORL/sweeps/tdn3t7wv", "IQL") + +get_urls("tlab/CORL/sweeps/0ey8ru2j", "DT") + +get_urls("tlab/CORL/sweeps/fdxa3fga", "SAC-N") + +get_urls("tlab/CORL/sweeps/ptgj7mhu", "EDAC") + +get_urls("tlab/CORL/sweeps/9828rg17", "BC") + +get_urls("tlab/CORL/sweeps/b49ukhml", "AWAC") + +get_urls("tlab/CORL/sweeps/pprq5ur4", "SAC-N") + +get_urls("tlab/CORL/sweeps/je3ac4nx", "EDAC") + +get_urls("tlab/CORL/sweeps/ymvugomt", "IQL") + +get_urls("tlab/CORL/sweeps/qkukub0n", "TD3+BC") + +get_urls("tlab/CORL/sweeps/nvn328sg", "DT") + +get_urls("tlab/CORL/sweeps/2d0jczwg", "IQL") + +get_urls("tlab/CORL/sweeps/9jtj053n", "TD3+BC") + +get_urls("tlab/CORL/sweeps/zzglpp0f", "BC") + +get_urls("tlab/CORL/sweeps/hp7tiw93", "CQL") + +get_urls("tlab/CORL/sweeps/3ui4jhet", "SAC-N") + +get_urls("tlab/CORL/sweeps/uhgujgoy", "AWAC") + +get_urls("tlab/CORL/sweeps/0v8tnh8y", "BC") + +get_urls("tlab/CORL/sweeps/e1e6fzv1", "BC") + +get_urls("tlab/CORL/sweeps/sg1hx5v7", "DT") + +get_urls("tlab/CORL/sweeps/nev3j9wx", "EDAC") + +get_urls("tlab/CORL/sweeps/03sfog1g", "ReBRAC") + +# OLD RUNS +# BC +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/gae6mjr6") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/3dda9gfw") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/3sgbj9n0") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/67eno4ma") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/3bur5hke") + 
+collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/330z0l2v") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/1i05t3vj") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/k9yfle3x") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/1zreo8zw") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/18vbgvb2") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/ky3vncuf") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/3tz0z6nh") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/31dmbfoz") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/1rhop7f6") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/2q070txr") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/sbcrq218") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/28iujcoa") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/2f12hcq3") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/1ptuak40") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/36y8187b") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/3bn0h2zy") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/3joz13bc") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/3s9l1a83") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/1q966noh") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/2b85pbgd") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/ca0nxbh4") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/1ipey1bk") + 
+collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/x35k6x12") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/1owdjob7") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/xoosoz9n") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/3r09yx27") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/3k5v2mso") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/39tqleqs") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/9cddvu7a") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/17v5isiw") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/2a8wzq2t") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/1tgqpiks") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/19yfj5xu") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/2bneh6uw") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/3twop214") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/rhkaisgq") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/287bzpdd") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/l2gfzbhg") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/3gnugxzy") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/2uwtj2md") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/60yn1nfx") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/2p0w55iq") + +collected_urls["algorithm"].append("BC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/2rv6pvln") + +# 10% BC +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("maze2d-large-v1") 
+collected_urls["url"].append("tlab/CORL/runs/84b74c6e-bc52-4083-a601-6a387726c61d") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/e22c302b-e387-4d12-a498-db1c7b787306") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/c76a5b7c-f459-498e-9aa9-6c0366ded313") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/dafaa4dc-9359-4feb-be9b-39c3dcadcdd4") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/7aff87ac-17e1-49a8-b52d-a210c9be9eee") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/d14de446-beea-413f-ad5e-c90dfd0e790c") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/d4713f18-520a-459e-80a6-0acd70d0710f") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/dfbcb740-26ca-4bbf-9065-ad3ecd60c261") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/134273d4-5eb7-4e42-a62b-b3a387a7a2a4") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/f6b33b84-b8c4-42a9-aae4-0d12db4f8b92") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/c8dff5d6-4b22-4e7f-a3b1-5913ae9b0aed") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/6d454981-bf52-4126-b4bc-436e566b76be") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/5d7df542-1567-462f-8885-8c8a0e8a5d19") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/d1d0f883-1b1d-4429-8c3c-02de6c989cdb") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/8ccf19da-a0e6-4267-a53a-276349aea3be") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/2c0ea1a2-614b-414a-b6fc-baa9663891da") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/3cc3a7f7-8ff0-497c-a6e0-e6c5c5ca9688") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/c0de3f56-a236-44a4-a532-04064af81b18") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/f2f1507a-9066-4df1-962e-a3d9bed3015a") + +collected_urls["algorithm"].append("10% BC") 
+collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/6313d5cf-9158-4585-9f48-cccbe1ff16f1") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/ba6e7a6d-2548-4d8a-a35f-286782c3658e") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/ab521663-97d4-4b00-a992-b602d495f7d7") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/f6c1e15a-23d4-472d-846f-e766a835d67b") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/482908a6-eb2e-4b3d-8254-0ef0124f488e") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/7fc8e114-0c73-4c47-977a-7f8d337dac1f") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/c2e4d867-a355-4030-b23f-e9845da0c4bf") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/cec9a1e2-a270-4270-861b-88535dcd4103") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/dcc5696c-bc69-41a3-a4f7-2865a16651ef") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/b86f27c4-05d0-43d8-b95e-b81edeb45144") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/364433ae-2974-48c7-a8e5-fc7606dbc819") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/ba1ae355-2945-4c82-a7be-49e421b59574") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/c9b94c6c-8a73-4259-848b-61f7b9386309") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/323e3a40-e919-4dd6-9d97-3e6f7a01b118") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/6065ffc6-8cee-45d8-b2e5-a600922a89cc") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/b418e6f1-1fcc-43dc-b5e3-475c17d3da1a") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/9b7add9a-d916-4ac8-9538-09d82ea6a7c4") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/0155fffe-76ae-4580-ba4a-c90d8c83c8d6") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("walker2d-medium-v2") 
+collected_urls["url"].append("tlab/CORL/runs/e7ea6fec-ac94-483f-af5a-c20790569efd") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/af373d51-823c-4ebc-b863-3ffefb6ad5f0") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/82e587c5-afc5-47f3-b71c-734472174a19") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/1bca103d-fa9b-405f-a4c3-f4f5aee161c1") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/706ea73c-c148-4f2f-96c6-347e600ae566") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/e51f8235-0ea3-4eb5-a2ff-67d159404783") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/5cd02078-1a5b-4721-9070-c8a5d7bce477") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/40eaf786-7305-46a0-8b4c-2dc608c9cf34") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/4bceaa03-d8e6-4ec5-b417-d1007f4a7504") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/e1f340a7-f659-4143-8c76-22d341532e9c") + +collected_urls["algorithm"].append("10% BC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/df22f73b-3904-4d3d-be82-8565a94f90a9") + +# +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/3gmwuspv") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/hfnz06jo") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/22zd4qy5") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/2je1ydbq") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/2cn5kybz") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/4wfevsn1") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/8uc5g9vl") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/3q3i7kr4") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/1383sspe") + +collected_urls["algorithm"].append("TD3+BC") 
+collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/ujqk6bcx") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/2har775v") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/1t9zpxwq") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/1manw8ou") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/glmwyvtm") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/99lixj21") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/21qd6jdk") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/13i7gvdv") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/lfnzn3ek") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/2iqxrf7v") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/28q8k0is") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/2klwm3m9") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/vgj8gxc9") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/1zpikd1i") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/3mhuu91m") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/o9cy1xot") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/9oorg18b") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/8umnr31k") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/8ay8wua0") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/36r6bciu") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/3dhx3yws") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("hopper-medium-replay-v2") 
+collected_urls["url"].append("tlab/CORL/runs/2xgt4p29") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/2i8f6fsw") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/1pocua7w") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/3apac4jp") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/3axkszn9") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/iyy3p627") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/2evz37in") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/rcuf9ji6") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/2nguxmuw") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/563x3nqx") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/3pp38z95") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/c7htx54f") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/35i1e9k3") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/34kpercv") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/1y6a1ghl") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/1r5ja7w3") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/2ksjowc8") + +collected_urls["algorithm"].append("TD3+BC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/1v789w9r") + +# +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/f5447eae-38f5-404e-ab97-979d12a62dba") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/063ec049-6092-46fd-8d06-5c43aa0c8933") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/517996bc-48dd-4cc5-a1a2-b599668dfb03") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("maze2d-large-v1") 
+collected_urls["url"].append("tlab/CORL/runs/cdb110c8-baed-4b72-9338-e2df069c1999") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/863ba3ad-2e15-4027-a561-50a1ce837a2e") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/a120a194-2a4d-493f-a105-29e81c2167f3") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/db99a51a-20ec-4898-b432-7bed581b11eb") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/ef619bf1-e43f-4ca0-b26a-e44a79c8d6c4") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/d61f15f2-bb63-4b0e-8a3f-0a8397f85c99") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/bc356f6c-ff8a-4fcb-8f7d-eda711bf187f") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/e55c1f59-4a22-4adf-90db-55b761184c31") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/754eb9df-300b-4816-b483-1ecc8630d170") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/fcdf10b7-3f06-4950-89e5-0bb706d32fa2") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/3149b249-61b7-42b5-b62c-560263073ceb") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/e3f4068c-2f7a-4d98-8bfe-71e5bcd37f60") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/07bafbb5-cef0-487f-9d18-43f5e6f41e5b") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/bdc16cb0-7ba1-44e5-a634-f7821849e911") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/1c63037a-0f9e-4c92-8e30-f868e5899235") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/49ccdf3d-49f8-43f7-ae5e-5f2166928b08") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/86e2bdf2-bfc8-4dd8-b245-06f3c5948525") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/b7865c5a-6382-4dfe-967d-f5f41caef859") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/1a9ae20a-0ef3-4517-aa21-0114606e8e44") + +collected_urls["algorithm"].append("DT") 
+collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/68993e5b-f477-496e-ab8c-da7808851e31") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/d9682650-69b2-4cce-832c-a0a5d63d7b87") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/51b5a164-e6ab-4929-bf76-b786a3e40654") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/abd10b19-e2c5-4e27-99ac-2ca8445acd51") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/5c0c2cb0-2457-40dc-905b-8bf32b8a75fe") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/98977940-fab9-462c-ac70-3fcd10bc55cb") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/a513ea52-a879-47a6-ab4c-ac1a046b5cc2") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/0cffd41b-d983-4b45-93c8-2e22fc5801c0") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/c7b8a1c8-170f-4060-860c-62553ff67911") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/7df0497b-d805-47ce-91ba-485d7bff6fb6") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/3db49470-beba-49f8-963b-bc7fbe79d107") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/21fea44e-168d-4356-a72c-1ac09a482d05") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/60a8e98b-5933-491e-83c7-f48b777fb52e") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/7eaf035d-9394-4eee-97f0-50347b108b6a") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/76b97aeb-4327-4fb1-bbd4-572f84b9ac6c") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/2eaf20df-c7d2-42c7-9d6f-5f29e240b99f") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/fa033830-cec7-4144-894d-741391fdb81d") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/04917eeb-b7a5-4e02-9e89-7eed774cd00b") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/d296d6ef-8a37-4c39-be14-ab54eb85a0ee") + 
+collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/825a83d5-0ed4-4c97-9c79-13edfa43e6cc") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/277df654-7035-4469-8150-ff3df3f6230e") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/6428588e-c9bc-43ba-a945-285248e0664b") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/0d1ae046-abcb-4da1-b2d3-1360bbd8f54f") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/9eb231d9-6c25-4d42-9564-90164b7e680b") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/f4c212ba-7b8e-428e-9953-71606fd84d67") + +collected_urls["algorithm"].append("DT") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/3bc164b8-1fc0-4ce5-a32d-701e522ad5b1") + +# +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/a7e3d2a0-2dbc-4eba-b28d-8315f992bae3") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/65981364-10fc-47d3-bb35-ccc67254ca23") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/ceb4bd07-50d4-426c-9e2b-a54fc4a1092a") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/a2fe5d76-b680-42b1-aafa-4f7fae8e9575") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/342b9c5e-eb78-45b1-99fc-97654d2d619a") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/eaab4d73-b002-4587-89e9-b101efc5c385") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/f83b4b8c-bddd-469a-acf5-c2c59b80fd3c") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/4c2065f4-e773-4760-a045-18958aff4685") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/eef336bc-42f0-46bc-90df-17d6b5647263") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/16b37de3-9011-4a20-b58a-d1d97946125a") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/81bdccf5-1ce7-4ab5-9228-1193209b9f85") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("maze2d-umaze-v1") 
+collected_urls["url"].append("tlab/CORL/runs/700bc2bd-3ae8-4845-a5a7-ea9ce5a5bf68") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/c0015d64-2bce-4bf7-a804-92390d022ec9") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/f7a045fb-89de-4df1-a827-0b0aff6fa803") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/c61cc412-51fa-41ef-be06-5e8eaba5272e") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/e08593b0-edc7-49a7-bf68-e66e613ed20f") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/3be8a859-82e5-4cc2-899d-4ff7f88a90ed") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/c5dd3800-eed4-4711-8172-0d22bc985ed9") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/ff761882-9f47-4f3b-8cf9-0f5cf0b40339") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/0257eae7-716d-4c68-b8a2-1d99c74d79d0") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/8c18b80d-028d-48dd-a371-b2fab308469a") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/c86ba1cc-8b4c-4dd8-b64d-8f57a8131d95") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/bc5fda0c-2f5c-4391-8bd5-c4f2e15c2e0c") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/c3fdffef-f3cb-4d18-9d94-af4e0651ba21") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/95c7d8e0-f634-403a-8edb-ea00afd5c69c") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/4580d97f-15b0-4d54-887c-91cf0a3368ea") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/ad47291b-1469-48b5-ba20-266a05bc9326") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/16f77985-8033-4953-8066-c33c49141581") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/69bf1797-94b0-43fa-b22c-a6406a93d222") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("hopper-medium-replay-v2") 
+collected_urls["url"].append("tlab/CORL/runs/dadbb413-ae11-48bb-a4bb-94c8b4c7d53f") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/c1db8aa9-9bfc-4687-a8b5-6096c90f6e9b") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/b6ff762e-c0be-4b6d-ac23-8b5ffcb28a56") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/ab688db2-ab1d-4d96-ba40-6186c7ecb16b") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/d0a5c6be-7b64-4ddb-b965-1ae8e0533363") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/9f67f421-c55b-4527-8ea0-8e6579a3bb61") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/ab44a4d1-6aee-420e-b691-307bd083d2ea") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/3394eb73-a8b3-463c-9a57-8dd65833ecdd") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/89527361-8f90-47a5-8882-ac3459de0d0a") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/f02528e5-86d6-4242-961f-106cb0e5df14") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/132a99bc-386a-4eb4-a64c-74699d0563b5") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/33ce900d-b858-4bc3-a6dc-71f9615cfad5") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/87addd3a-42bd-45b7-8dcb-a921dfa6dad5") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/bcfb639c-1d44-4228-bbd8-e560b48bb5d6") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/249f88e4-c98f-401f-bb36-4d5f239fff74") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/fc7fa907-ab00-457d-a00d-2bdd65688379") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/20f7258d-0f07-4002-86b2-4c3ec65ee067") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/c3e71147-80a2-4ae8-bb59-9b994daaa516") + +collected_urls["algorithm"].append("SAC-N") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/e36a72da-482f-4a70-803f-1a0d7eccb265") 
+ +# +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/1m3k2bd1") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/3jzf46zg") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/exlzrv4v") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/3r2qku3k") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/3crj1urn") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/25vxky59") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/258aw9fy") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/3oc7jc1q") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/31ak0z9b") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/hjl7pxfa") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/2qq9dfgc") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/c0pdrw6f") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/5d588f87-fe51-4253-b310-a75fbf8d3702") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/10aa52ac-b2f4-43c4-97f1-4bee57fdab24") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/3500687d-84c6-4cc6-88a9-ac432fe83f42") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/2108ebe3-d55d-418a-9fda-f78a8337909a") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/8853c87c-9bdc-411e-8128-f0976c510485") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/b86adeb5-282b-4f9b-bd4f-361b576c9988") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/6b675ca0-3fed-498a-ae54-e964673158d4") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/48813224-53a2-495e-86a2-d72a5b95ba94") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") 
+collected_urls["url"].append("tlab/CORL/runs/996be0e1-ae88-492d-b261-15f034cc6203") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/62bcf801-db79-438e-b0f4-74436f3c67b1") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/ffddfea8-2e9b-493b-88df-04a15f97d7a8") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/b07eb900-8653-4688-a10f-111f3eb3c84a") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/59f743f9-3b3a-4306-83b5-98721508bf2f") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/74a7e942-ca43-44e8-85f7-976fa7dd2edd") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/20425c80-a0f3-4e1a-9991-a85db7012417") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/6fb1e9e2-9485-40c9-ac77-b118cd9cc55b") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/6145c71a-ce9b-4817-bf94-a6eef9b79377") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/c7d59200-7e0f-47a4-846a-123fb23d3c30") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/00379327-06d9-4117-9abb-0f4fef0d6f38") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/dc1c3646-d8fd-4671-b43c-b987441f70cf") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/a58fedea-d5fe-4481-bca4-0e44989f049e") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/05dc4e17-4c73-4f71-b5c3-2eb39aae36c8") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/155aa581-5e1f-4d32-acd5-edde7c5e3c6a") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/5e5e6d1a-59c4-4044-9d50-7d1b920bb626") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/ffb22753-338f-4d2a-ba45-aaeba6a5eed3") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/6d1e8c3f-bd50-4e02-8adc-bf7db13d15ad") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/f99181eb-499d-48be-b1e3-5349f8fe3731") + 
+collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/fd8b7f41-48cc-4578-8fc8-55ec5e5884df") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/a0a92721-04b1-4868-809e-2ce37358516b") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/c484e9cd-ee4d-427a-941d-80926caa3128") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/5790cb46-ea8c-42b6-abe6-a70faa0f4633") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/ed665d8c-1bb5-4858-9136-574bf523b39a") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/1e6e9a77-a335-41e0-9e29-6271f5a4fcda") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/d6492463-82f1-4512-99fa-b23073d6b418") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/96027203-781b-46ee-bf59-e565227f2f7b") + +collected_urls["algorithm"].append("EDAC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/d5f5f415-9d1b-4d35-b4e5-c1cf278af46c") + +# +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/3me14n0w") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/8671xq2j") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/3keq4k8a") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("maze2d-large-v1") +collected_urls["url"].append("tlab/CORL/runs/3jq85ti0") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/1vvutaak") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/16nzq1ng") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/3552gil2") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("maze2d-medium-v1") +collected_urls["url"].append("tlab/CORL/runs/3l3dpq11") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/3usi5cuh") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/2vvw9y8h") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/2vcog7cq") + +collected_urls["algorithm"].append("AWAC") 
+collected_urls["dataset"].append("maze2d-umaze-v1") +collected_urls["url"].append("tlab/CORL/runs/qp93j6we") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/1n8ttdck") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/1bpgemq2") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/39wb3kat") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("halfcheetah-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/w9i9g39x") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/3gfpaz8e") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/3aerk47s") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/275nzj65") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("halfcheetah-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/2fxchaks") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/220xo7sy") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/186848oq") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/2qcui7s9") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("halfcheetah-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/3izk7ats") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/3p8nop3c") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/2n4njt2r") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/cfgxmidd") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("hopper-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/o3jqikii") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/1jg2th4m") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/3qqk3v1v") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/1og7e8w1") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("hopper-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/1hg2vtf9") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("hopper-medium-expert-v2") 
+collected_urls["url"].append("tlab/CORL/runs/3b6t3c8p") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/i15nczq4") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/3v7jt3p7") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("hopper-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/2uvghydj") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/3v1rznw2") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/2ov8rc9w") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/3funjmu4") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("walker2d-medium-v2") +collected_urls["url"].append("tlab/CORL/runs/3o823qdi") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/21coamdv") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/35cmwtdl") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/3pvuqbr5") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("walker2d-medium-replay-v2") +collected_urls["url"].append("tlab/CORL/runs/ic2e00s6") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/2utgl834") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/3hvawfk9") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/3mo9ld3q") + +collected_urls["algorithm"].append("AWAC") +collected_urls["dataset"].append("walker2d-medium-expert-v2") +collected_urls["url"].append("tlab/CORL/runs/1aihv0tw") + +dataframe = pd.DataFrame(collected_urls) +dataframe.to_csv("runs_tables/offline_urls.csv", index=False) diff --git a/results/runs_tables/finetune_urls.csv b/results/runs_tables/finetune_urls.csv new file mode 100644 index 00000000..3058d67a --- /dev/null +++ b/results/runs_tables/finetune_urls.csv @@ -0,0 +1,201 @@ +algorithm,dataset,url +SPOT,door-cloned-v1,tlab/CORL/runs/eojcj0p8 +SPOT,door-cloned-v1,tlab/CORL/runs/9ckwzft9 +SPOT,door-cloned-v1,tlab/CORL/runs/zgv37urh +SPOT,door-cloned-v1,tlab/CORL/runs/ln7m8q61 +SPOT,hammer-cloned-v1,tlab/CORL/runs/sa3ykqx3 +SPOT,hammer-cloned-v1,tlab/CORL/runs/jd2yz5qp +SPOT,hammer-cloned-v1,tlab/CORL/runs/u9oibqav +SPOT,hammer-cloned-v1,tlab/CORL/runs/lbpcs2k6 +SPOT,pen-cloned-v1,tlab/CORL/runs/wwmuhshh +SPOT,pen-cloned-v1,tlab/CORL/runs/y31k90c8 +SPOT,pen-cloned-v1,tlab/CORL/runs/no9jpmw8 +SPOT,pen-cloned-v1,tlab/CORL/runs/xlja3k66 +SPOT,relocate-cloned-v1,tlab/CORL/runs/sefbgiwe +SPOT,relocate-cloned-v1,tlab/CORL/runs/yhvcqfbu +SPOT,relocate-cloned-v1,tlab/CORL/runs/aafkqjo3 
+SPOT,relocate-cloned-v1,tlab/CORL/runs/m8vu8vwt +SPOT,antmaze-umaze-v2,tlab/CORL/runs/sh77hmf8 +SPOT,antmaze-umaze-v2,tlab/CORL/runs/7k84le92 +SPOT,antmaze-umaze-v2,tlab/CORL/runs/k56t0fow +SPOT,antmaze-umaze-v2,tlab/CORL/runs/u734pet4 +SPOT,antmaze-medium-play-v2,tlab/CORL/runs/tm6azb09 +SPOT,antmaze-medium-play-v2,tlab/CORL/runs/js6p40n7 +SPOT,antmaze-medium-play-v2,tlab/CORL/runs/jk787i4r +SPOT,antmaze-medium-play-v2,tlab/CORL/runs/njjgya5v +SPOT,antmaze-umaze-diverse-v2,tlab/CORL/runs/ky6ge6zd +SPOT,antmaze-umaze-diverse-v2,tlab/CORL/runs/s3hfrsrr +SPOT,antmaze-umaze-diverse-v2,tlab/CORL/runs/yj56lahi +SPOT,antmaze-umaze-diverse-v2,tlab/CORL/runs/csa2jjkd +SPOT,antmaze-large-diverse-v2,tlab/CORL/runs/0sd0xemj +SPOT,antmaze-large-diverse-v2,tlab/CORL/runs/0kn4pl04 +SPOT,antmaze-large-diverse-v2,tlab/CORL/runs/onon4kbo +SPOT,antmaze-large-play-v2,tlab/CORL/runs/5ldclyhi +SPOT,antmaze-large-play-v2,tlab/CORL/runs/v8uskc0k +SPOT,antmaze-medium-diverse-v2,tlab/CORL/runs/sysutdr0 +SPOT,antmaze-medium-diverse-v2,tlab/CORL/runs/tnksp757 +SPOT,antmaze-medium-diverse-v2,tlab/CORL/runs/cg7vkg7p +SPOT,antmaze-medium-diverse-v2,tlab/CORL/runs/12ivynlo +SPOT,antmaze-large-diverse-v2,tlab/CORL/runs/kvxnj3cw +SPOT,antmaze-large-play-v2,tlab/CORL/runs/h2sq31u7 +SPOT,antmaze-large-play-v2,tlab/CORL/runs/hp23le7j +AWAC,door-cloned-v1,tlab/CORL/runs/opk315cr +AWAC,door-cloned-v1,tlab/CORL/runs/ty7v9xmw +AWAC,door-cloned-v1,tlab/CORL/runs/87tkm1gd +AWAC,door-cloned-v1,tlab/CORL/runs/z6ufsg2k +AWAC,hammer-cloned-v1,tlab/CORL/runs/6ljwk1po +AWAC,hammer-cloned-v1,tlab/CORL/runs/cf7qx9hk +AWAC,hammer-cloned-v1,tlab/CORL/runs/annl41zp +AWAC,hammer-cloned-v1,tlab/CORL/runs/p9ajfr0x +AWAC,pen-cloned-v1,tlab/CORL/runs/60sm3kj9 +AWAC,pen-cloned-v1,tlab/CORL/runs/euosw3ed +AWAC,pen-cloned-v1,tlab/CORL/runs/dha1t6l2 +AWAC,pen-cloned-v1,tlab/CORL/runs/6xib4k09 +AWAC,relocate-cloned-v1,tlab/CORL/runs/vveuyu6c +AWAC,relocate-cloned-v1,tlab/CORL/runs/rmo3dlge +AWAC,relocate-cloned-v1,tlab/CORL/runs/nk2js2h7 +AWAC,relocate-cloned-v1,tlab/CORL/runs/wau1sbk4 +AWAC,antmaze-umaze-v2,tlab/CORL/runs/xxge9l2q +AWAC,antmaze-umaze-v2,tlab/CORL/runs/r6kduddm +AWAC,antmaze-umaze-v2,tlab/CORL/runs/pamlqf0k +AWAC,antmaze-umaze-v2,tlab/CORL/runs/w46o6pvd +AWAC,antmaze-medium-play-v2,tlab/CORL/runs/of5wccnb +AWAC,antmaze-medium-play-v2,tlab/CORL/runs/ot2pz1qd +AWAC,antmaze-medium-play-v2,tlab/CORL/runs/16zrj330 +AWAC,antmaze-medium-play-v2,tlab/CORL/runs/nzlutwqz +AWAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/esv2fp90 +AWAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/9qe7mk9a +AWAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/49g1f0by +AWAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/h870x3bk +AWAC,antmaze-large-diverse-v2,tlab/CORL/runs/8xs0kekx +AWAC,antmaze-large-diverse-v2,tlab/CORL/runs/wuvlj00x +AWAC,antmaze-large-diverse-v2,tlab/CORL/runs/ntpk9eke +AWAC,antmaze-large-diverse-v2,tlab/CORL/runs/vtqqrmrb +AWAC,antmaze-large-play-v2,tlab/CORL/runs/z88yrg2k +AWAC,antmaze-large-play-v2,tlab/CORL/runs/b1e1up19 +AWAC,antmaze-large-play-v2,tlab/CORL/runs/bpq150gg +AWAC,antmaze-large-play-v2,tlab/CORL/runs/rffmurq2 +AWAC,antmaze-medium-diverse-v2,tlab/CORL/runs/tiq65215 +AWAC,antmaze-medium-diverse-v2,tlab/CORL/runs/uo63dgzx +AWAC,antmaze-medium-diverse-v2,tlab/CORL/runs/mfjh57xv +AWAC,antmaze-medium-diverse-v2,tlab/CORL/runs/325fy2js +CQL,antmaze-umaze-v2,tlab/CORL/runs/vdh2wmw9 +CQL,antmaze-umaze-v2,tlab/CORL/runs/wh27aupq +CQL,antmaze-umaze-v2,tlab/CORL/runs/7r4uwutz +CQL,antmaze-large-play-v2,tlab/CORL/runs/kt7jwqcz 
+CQL,antmaze-umaze-v2,tlab/CORL/runs/l5xvgwt4 +CQL,antmaze-large-play-v2,tlab/CORL/runs/8fm40vpm +CQL,antmaze-large-play-v2,tlab/CORL/runs/yeax28su +CQL,antmaze-medium-diverse-v2,tlab/CORL/runs/gvhslqyo +CQL,antmaze-medium-diverse-v2,tlab/CORL/runs/mowkqr6u +CQL,antmaze-medium-diverse-v2,tlab/CORL/runs/pswhm9pi +CQL,antmaze-medium-diverse-v2,tlab/CORL/runs/hh5vv5qc +CQL,door-cloned-v1,tlab/CORL/runs/2xc0y5sd +CQL,door-cloned-v1,tlab/CORL/runs/2ylul8yr +CQL,door-cloned-v1,tlab/CORL/runs/lficzs9q +CQL,door-cloned-v1,tlab/CORL/runs/1q7fleea +CQL,hammer-cloned-v1,tlab/CORL/runs/ai7uzyue +CQL,hammer-cloned-v1,tlab/CORL/runs/pfg12jkr +CQL,hammer-cloned-v1,tlab/CORL/runs/peg6p1tn +CQL,hammer-cloned-v1,tlab/CORL/runs/5b5gfn29 +CQL,pen-cloned-v1,tlab/CORL/runs/skfco0f3 +CQL,pen-cloned-v1,tlab/CORL/runs/y1xn14fc +CQL,pen-cloned-v1,tlab/CORL/runs/ko7bpkib +CQL,pen-cloned-v1,tlab/CORL/runs/k03ln3jt +CQL,relocate-cloned-v1,tlab/CORL/runs/p1ldsu7x +CQL,relocate-cloned-v1,tlab/CORL/runs/kx2c5wed +CQL,relocate-cloned-v1,tlab/CORL/runs/vigvty09 +CQL,relocate-cloned-v1,tlab/CORL/runs/m4zo4sd9 +CQL,antmaze-medium-play-v2,tlab/CORL/runs/07uz9o1u +CQL,antmaze-medium-play-v2,tlab/CORL/runs/z5mb7tct +CQL,antmaze-medium-play-v2,tlab/CORL/runs/y0qkgbhl +CQL,antmaze-medium-play-v2,tlab/CORL/runs/vgj3kc4h +CQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/t8n4l1rz +CQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/jcqp22ib +CQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/g16g99mu +CQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/53ud2dfw +CQL,antmaze-large-diverse-v2,tlab/CORL/runs/o2egtkpf +CQL,antmaze-large-diverse-v2,tlab/CORL/runs/iwywzz0e +CQL,antmaze-large-diverse-v2,tlab/CORL/runs/58n8wj48 +CQL,antmaze-large-diverse-v2,tlab/CORL/runs/merjc5q4 +CQL,antmaze-large-play-v2,tlab/CORL/runs/s1ttvc6h +IQL,door-cloned-v1,tlab/CORL/runs/28r3hs26 +IQL,door-cloned-v1,tlab/CORL/runs/wrppy19y +IQL,door-cloned-v1,tlab/CORL/runs/x28mew02 +IQL,door-cloned-v1,tlab/CORL/runs/m8i3wogb +IQL,hammer-cloned-v1,tlab/CORL/runs/7u30km42 +IQL,hammer-cloned-v1,tlab/CORL/runs/bvdezudo +IQL,hammer-cloned-v1,tlab/CORL/runs/ui1eca3a +IQL,hammer-cloned-v1,tlab/CORL/runs/l0k5d09o +IQL,pen-cloned-v1,tlab/CORL/runs/98jdjivj +IQL,pen-cloned-v1,tlab/CORL/runs/aeeqif54 +IQL,pen-cloned-v1,tlab/CORL/runs/7fbjbh7u +IQL,pen-cloned-v1,tlab/CORL/runs/xarrd2e5 +IQL,relocate-cloned-v1,tlab/CORL/runs/tp5s7tvf +IQL,relocate-cloned-v1,tlab/CORL/runs/t3e975k9 +IQL,relocate-cloned-v1,tlab/CORL/runs/a3hj1pc8 +IQL,relocate-cloned-v1,tlab/CORL/runs/cxoqai00 +IQL,antmaze-umaze-v2,tlab/CORL/runs/nhip1045 +IQL,antmaze-umaze-v2,tlab/CORL/runs/3vk4iyrd +IQL,antmaze-umaze-v2,tlab/CORL/runs/akdgnyqy +IQL,antmaze-umaze-v2,tlab/CORL/runs/oxhbok2v +IQL,antmaze-medium-play-v2,tlab/CORL/runs/cpi6g8lw +IQL,antmaze-medium-play-v2,tlab/CORL/runs/yqckdfax +IQL,antmaze-medium-play-v2,tlab/CORL/runs/bxojsnny +IQL,antmaze-medium-play-v2,tlab/CORL/runs/wi2ipkzl +IQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/xvtfkaw5 +IQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/cnp2e8wm +IQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/4o8tdzv4 +IQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/a2jrnzjz +IQL,antmaze-large-diverse-v2,tlab/CORL/runs/mh1khiw7 +IQL,antmaze-large-diverse-v2,tlab/CORL/runs/uja1k96c +IQL,antmaze-large-diverse-v2,tlab/CORL/runs/8b6zwm6w +IQL,antmaze-large-diverse-v2,tlab/CORL/runs/3k3o8jak +IQL,antmaze-large-play-v2,tlab/CORL/runs/i40zkkb5 +IQL,antmaze-large-play-v2,tlab/CORL/runs/4ecawkom +IQL,antmaze-large-play-v2,tlab/CORL/runs/8cg1n28z +IQL,antmaze-large-play-v2,tlab/CORL/runs/s88ixn7g 
+IQL,antmaze-medium-diverse-v2,tlab/CORL/runs/vaa56ykf +IQL,antmaze-medium-diverse-v2,tlab/CORL/runs/van7r2au +IQL,antmaze-medium-diverse-v2,tlab/CORL/runs/gifi8vh6 +IQL,antmaze-medium-diverse-v2,tlab/CORL/runs/8y5gwfhm +Cal-QL,door-cloned-v1,tlab/CORL/runs/oi1ig0ri +Cal-QL,door-cloned-v1,tlab/CORL/runs/i069hyd7 +Cal-QL,door-cloned-v1,tlab/CORL/runs/rhhdlroq +Cal-QL,door-cloned-v1,tlab/CORL/runs/eicij2jh +Cal-QL,hammer-cloned-v1,tlab/CORL/runs/kusvjf0g +Cal-QL,hammer-cloned-v1,tlab/CORL/runs/1lqi4sg9 +Cal-QL,hammer-cloned-v1,tlab/CORL/runs/2fu95t4k +Cal-QL,hammer-cloned-v1,tlab/CORL/runs/7wkikqpn +Cal-QL,pen-cloned-v1,tlab/CORL/runs/csoban2m +Cal-QL,pen-cloned-v1,tlab/CORL/runs/fj45ivs8 +Cal-QL,pen-cloned-v1,tlab/CORL/runs/o0y2q02v +Cal-QL,pen-cloned-v1,tlab/CORL/runs/hzq011ab +Cal-QL,relocate-cloned-v1,tlab/CORL/runs/c1csqi8s +Cal-QL,relocate-cloned-v1,tlab/CORL/runs/30r23nbv +Cal-QL,relocate-cloned-v1,tlab/CORL/runs/ywe1cfqa +Cal-QL,relocate-cloned-v1,tlab/CORL/runs/kc7mgqh5 +Cal-QL,antmaze-umaze-v2,tlab/CORL/runs/d5f3ul52 +Cal-QL,antmaze-umaze-v2,tlab/CORL/runs/fjsryl4k +Cal-QL,antmaze-umaze-v2,tlab/CORL/runs/z781tlua +Cal-QL,antmaze-umaze-v2,tlab/CORL/runs/mbpoixey +Cal-QL,antmaze-medium-play-v2,tlab/CORL/runs/d2gndjad +Cal-QL,antmaze-medium-play-v2,tlab/CORL/runs/kqxyllfa +Cal-QL,antmaze-medium-play-v2,tlab/CORL/runs/qaowm0ds +Cal-QL,antmaze-medium-play-v2,tlab/CORL/runs/ybpehr4w +Cal-QL,antmaze-umaze-diverse-v2,tlab/CORL/runs/xamd4zxj +Cal-QL,antmaze-umaze-diverse-v2,tlab/CORL/runs/a015fjb1 +Cal-QL,antmaze-umaze-diverse-v2,tlab/CORL/runs/1pu06s2i +Cal-QL,antmaze-umaze-diverse-v2,tlab/CORL/runs/iwa1o31k +Cal-QL,antmaze-large-diverse-v2,tlab/CORL/runs/yvqv3mxa +Cal-QL,antmaze-large-diverse-v2,tlab/CORL/runs/4myjeu5g +Cal-QL,antmaze-large-diverse-v2,tlab/CORL/runs/6ptdr78l +Cal-QL,antmaze-large-diverse-v2,tlab/CORL/runs/8ix0469p +Cal-QL,antmaze-large-play-v2,tlab/CORL/runs/4chdwkua +Cal-QL,antmaze-large-play-v2,tlab/CORL/runs/fzrlcnwp +Cal-QL,antmaze-large-play-v2,tlab/CORL/runs/f9hz4fal +Cal-QL,antmaze-large-play-v2,tlab/CORL/runs/fpq2ob8q +Cal-QL,antmaze-medium-diverse-v2,tlab/CORL/runs/zhf7tr7p +Cal-QL,antmaze-medium-diverse-v2,tlab/CORL/runs/m02ew5oy +Cal-QL,antmaze-medium-diverse-v2,tlab/CORL/runs/9r1a0trx +Cal-QL,antmaze-medium-diverse-v2,tlab/CORL/runs/ds2dbx2u diff --git a/results/runs_tables/offline_urls.csv b/results/runs_tables/offline_urls.csv new file mode 100644 index 00000000..6877e2de --- /dev/null +++ b/results/runs_tables/offline_urls.csv @@ -0,0 +1,1201 @@ +algorithm,dataset,url +BC,antmaze-large-play-v2,tlab/CORL/runs/kwy58i15 +BC,antmaze-large-play-v2,tlab/CORL/runs/janur3zl +BC,antmaze-large-play-v2,tlab/CORL/runs/vlwc7kyk +BC,antmaze-large-play-v2,tlab/CORL/runs/p1gn517n +BC,antmaze-medium-play-v2,tlab/CORL/runs/25ik3239 +BC,antmaze-medium-play-v2,tlab/CORL/runs/j7e7s4b1 +BC,antmaze-medium-play-v2,tlab/CORL/runs/df4igrjn +BC,antmaze-medium-play-v2,tlab/CORL/runs/1o9e1g0n +BC,antmaze-umaze-v2,tlab/CORL/runs/nfizxigf +BC,antmaze-umaze-v2,tlab/CORL/runs/zzxha1ci +BC,antmaze-umaze-v2,tlab/CORL/runs/1ocsiue4 +BC,antmaze-umaze-v2,tlab/CORL/runs/eyrpur2p +10% BC,antmaze-large-play-v2,tlab/CORL/runs/wiybqzr0 +10% BC,antmaze-large-play-v2,tlab/CORL/runs/nlgiz0xi +10% BC,antmaze-large-play-v2,tlab/CORL/runs/uwb29xft +10% BC,antmaze-large-play-v2,tlab/CORL/runs/eahx2chh +10% BC,antmaze-medium-play-v2,tlab/CORL/runs/v82fk13n +10% BC,antmaze-medium-play-v2,tlab/CORL/runs/awzwjsgt +10% BC,antmaze-medium-play-v2,tlab/CORL/runs/85s2uzh7 +10% 
BC,antmaze-medium-play-v2,tlab/CORL/runs/fch91ou7 +10% BC,antmaze-umaze-v2,tlab/CORL/runs/p93gul6e +10% BC,antmaze-umaze-v2,tlab/CORL/runs/8suja0ax +10% BC,antmaze-umaze-v2,tlab/CORL/runs/ps7vb7qq +10% BC,antmaze-umaze-v2,tlab/CORL/runs/svd9puwm +AWAC,antmaze-large-play-v2,tlab/CORL/runs/gqsknjrw +AWAC,antmaze-large-play-v2,tlab/CORL/runs/uz1mui4r +AWAC,antmaze-large-play-v2,tlab/CORL/runs/5wiafkmp +AWAC,antmaze-large-play-v2,tlab/CORL/runs/0iexkmw7 +AWAC,antmaze-medium-play-v2,tlab/CORL/runs/2z2eoj8k +AWAC,antmaze-medium-play-v2,tlab/CORL/runs/8go012ev +AWAC,antmaze-medium-play-v2,tlab/CORL/runs/55m764cg +AWAC,antmaze-medium-play-v2,tlab/CORL/runs/8rsav8uu +AWAC,antmaze-umaze-v2,tlab/CORL/runs/eoktum38 +AWAC,antmaze-umaze-v2,tlab/CORL/runs/z8f19doo +AWAC,antmaze-umaze-v2,tlab/CORL/runs/dfr5z3sl +AWAC,antmaze-umaze-v2,tlab/CORL/runs/dg4hto72 +TD3+BC,antmaze-large-play-v2,tlab/CORL/runs/rk6gyy0i +TD3+BC,antmaze-large-play-v2,tlab/CORL/runs/hije3v88 +TD3+BC,antmaze-large-play-v2,tlab/CORL/runs/7z7wn6ai +TD3+BC,antmaze-large-play-v2,tlab/CORL/runs/swkgdla8 +TD3+BC,antmaze-medium-play-v2,tlab/CORL/runs/w2ri0chh +TD3+BC,antmaze-medium-play-v2,tlab/CORL/runs/3tunu7aj +TD3+BC,antmaze-medium-play-v2,tlab/CORL/runs/mmfrps5m +TD3+BC,antmaze-medium-play-v2,tlab/CORL/runs/y933egvk +TD3+BC,antmaze-umaze-v2,tlab/CORL/runs/7g6nfjbd +TD3+BC,antmaze-umaze-v2,tlab/CORL/runs/9h68rgtr +TD3+BC,antmaze-umaze-v2,tlab/CORL/runs/wuhzz3fr +TD3+BC,antmaze-umaze-v2,tlab/CORL/runs/fbdwb1g5 +IQL,maze2d-large-v1,tlab/CORL/runs/paqc6n7r +IQL,maze2d-large-v1,tlab/CORL/runs/t161ys9d +IQL,maze2d-large-v1,tlab/CORL/runs/3thh6k1c +IQL,maze2d-large-v1,tlab/CORL/runs/pptelbzd +IQL,maze2d-medium-v1,tlab/CORL/runs/y30a0o1o +IQL,maze2d-medium-v1,tlab/CORL/runs/g9hwgpsz +IQL,maze2d-medium-v1,tlab/CORL/runs/exk1h7rd +IQL,maze2d-medium-v1,tlab/CORL/runs/hqic3d7j +IQL,maze2d-umaze-v1,tlab/CORL/runs/fe4leetd +IQL,maze2d-umaze-v1,tlab/CORL/runs/1rok4pj1 +IQL,maze2d-umaze-v1,tlab/CORL/runs/cpjf024a +IQL,maze2d-umaze-v1,tlab/CORL/runs/w15rt35b +IQL,walker2d-medium-expert-v2,tlab/CORL/runs/4j6b4kxv +IQL,walker2d-medium-expert-v2,tlab/CORL/runs/ctpdqlh9 +IQL,walker2d-medium-expert-v2,tlab/CORL/runs/7cg28cul +IQL,walker2d-medium-expert-v2,tlab/CORL/runs/u60j7bd1 +IQL,walker2d-medium-replay-v2,tlab/CORL/runs/zq3c29cs +IQL,walker2d-medium-replay-v2,tlab/CORL/runs/clwpbg8k +IQL,walker2d-medium-replay-v2,tlab/CORL/runs/2pqh2b99 +IQL,walker2d-medium-replay-v2,tlab/CORL/runs/s0woblol +IQL,walker2d-medium-v2,tlab/CORL/runs/362pdyn5 +IQL,walker2d-medium-v2,tlab/CORL/runs/u44ct6p9 +IQL,walker2d-medium-v2,tlab/CORL/runs/qa14osol +IQL,walker2d-medium-v2,tlab/CORL/runs/oezars9l +IQL,hopper-medium-expert-v2,tlab/CORL/runs/t9j46dlk +IQL,hopper-medium-expert-v2,tlab/CORL/runs/03m68srt +IQL,hopper-medium-expert-v2,tlab/CORL/runs/uuzlo61d +IQL,hopper-medium-expert-v2,tlab/CORL/runs/5v5gdpty +IQL,hopper-medium-replay-v2,tlab/CORL/runs/vbxujllv +IQL,hopper-medium-replay-v2,tlab/CORL/runs/3clw63s0 +IQL,hopper-medium-replay-v2,tlab/CORL/runs/3nih2mzr +IQL,hopper-medium-replay-v2,tlab/CORL/runs/x5ye4zkg +IQL,hopper-medium-v2,tlab/CORL/runs/uu3c4n0a +IQL,hopper-medium-v2,tlab/CORL/runs/j5l3lyj4 +IQL,hopper-medium-v2,tlab/CORL/runs/zqtfyvf2 +IQL,hopper-medium-v2,tlab/CORL/runs/1aemqlfi +IQL,halfcheetah-medium-expert-v2,tlab/CORL/runs/oh266eu2 +IQL,halfcheetah-medium-expert-v2,tlab/CORL/runs/bp2ta12i +IQL,halfcheetah-medium-expert-v2,tlab/CORL/runs/c6wifr4s +IQL,halfcheetah-medium-expert-v2,tlab/CORL/runs/2h3hitjr 
+IQL,halfcheetah-medium-replay-v2,tlab/CORL/runs/dt4zqlzc +IQL,halfcheetah-medium-replay-v2,tlab/CORL/runs/5h85xd40 +IQL,halfcheetah-medium-replay-v2,tlab/CORL/runs/dypzd8f1 +IQL,halfcheetah-medium-replay-v2,tlab/CORL/runs/iitkz6fr +IQL,halfcheetah-medium-v2,tlab/CORL/runs/o3xv866d +IQL,halfcheetah-medium-v2,tlab/CORL/runs/belph12a +IQL,halfcheetah-medium-v2,tlab/CORL/runs/3izbg06r +IQL,halfcheetah-medium-v2,tlab/CORL/runs/o3n15zpx +IQL,antmaze-large-play-v2,tlab/CORL/runs/2shfj9a9 +IQL,antmaze-large-play-v2,tlab/CORL/runs/wb0s3i24 +IQL,antmaze-large-play-v2,tlab/CORL/runs/3y1ypq9l +IQL,antmaze-large-play-v2,tlab/CORL/runs/i3yp3e4k +IQL,antmaze-medium-play-v2,tlab/CORL/runs/mq2j0yy8 +IQL,antmaze-medium-play-v2,tlab/CORL/runs/d0kh6bq3 +IQL,antmaze-medium-play-v2,tlab/CORL/runs/06vntt7w +IQL,antmaze-medium-play-v2,tlab/CORL/runs/bb891xsq +IQL,antmaze-umaze-v2,tlab/CORL/runs/nqjjq5yu +IQL,antmaze-umaze-v2,tlab/CORL/runs/6fcav9y4 +IQL,antmaze-umaze-v2,tlab/CORL/runs/lr664dj2 +IQL,antmaze-umaze-v2,tlab/CORL/runs/qsznzxyg +DT,antmaze-large-play-v2,tlab/CORL/runs/4nphubu5 +DT,antmaze-large-play-v2,tlab/CORL/runs/mi933yoo +DT,antmaze-large-play-v2,tlab/CORL/runs/my4eb5rx +DT,antmaze-large-play-v2,tlab/CORL/runs/35w8s54f +DT,antmaze-medium-play-v2,tlab/CORL/runs/vb4rxa0n +DT,antmaze-medium-play-v2,tlab/CORL/runs/jpicssa5 +DT,antmaze-medium-play-v2,tlab/CORL/runs/hhmrsjan +DT,antmaze-medium-play-v2,tlab/CORL/runs/s0vm7v3p +DT,antmaze-umaze-v2,tlab/CORL/runs/w9xsjej5 +DT,antmaze-umaze-v2,tlab/CORL/runs/vjuin367 +DT,antmaze-umaze-v2,tlab/CORL/runs/v3gg1r65 +DT,antmaze-umaze-v2,tlab/CORL/runs/nac148qy +SAC-N,antmaze-large-play-v2,tlab/CORL/runs/ar8vd36z +SAC-N,antmaze-large-play-v2,tlab/CORL/runs/tsihyyyt +SAC-N,antmaze-large-play-v2,tlab/CORL/runs/ailbgx6w +SAC-N,antmaze-large-play-v2,tlab/CORL/runs/i952288w +SAC-N,antmaze-medium-play-v2,tlab/CORL/runs/uu0ccdnv +SAC-N,antmaze-medium-play-v2,tlab/CORL/runs/emkrjuku +SAC-N,antmaze-medium-play-v2,tlab/CORL/runs/qeknz9xa +SAC-N,antmaze-medium-play-v2,tlab/CORL/runs/bxne7xf4 +SAC-N,antmaze-umaze-v2,tlab/CORL/runs/b5oigkoe +SAC-N,antmaze-umaze-v2,tlab/CORL/runs/i6cibz6y +SAC-N,antmaze-umaze-v2,tlab/CORL/runs/r8heznya +SAC-N,antmaze-umaze-v2,tlab/CORL/runs/jlay7xyn +EDAC,antmaze-large-play-v2,tlab/CORL/runs/3f1hugcx +EDAC,antmaze-large-play-v2,tlab/CORL/runs/wn40kcqj +EDAC,antmaze-large-play-v2,tlab/CORL/runs/u5aeouah +EDAC,antmaze-large-play-v2,tlab/CORL/runs/061l8t9g +EDAC,antmaze-medium-play-v2,tlab/CORL/runs/fefui203 +EDAC,antmaze-medium-play-v2,tlab/CORL/runs/lwqa698k +EDAC,antmaze-medium-play-v2,tlab/CORL/runs/aj12ivh5 +EDAC,antmaze-medium-play-v2,tlab/CORL/runs/j8drml0r +EDAC,antmaze-umaze-v2,tlab/CORL/runs/9q7ycf99 +EDAC,antmaze-umaze-v2,tlab/CORL/runs/mhnl5d2u +EDAC,antmaze-umaze-v2,tlab/CORL/runs/nv1ncdv1 +EDAC,antmaze-umaze-v2,tlab/CORL/runs/ix5ky4oz +10% BC,antmaze-large-diverse-v2,tlab/CORL/runs/x2s54uam +10% BC,antmaze-large-diverse-v2,tlab/CORL/runs/176xlvpk +10% BC,antmaze-large-diverse-v2,tlab/CORL/runs/3vpeous1 +10% BC,antmaze-large-diverse-v2,tlab/CORL/runs/hwjcscht +10% BC,antmaze-medium-diverse-v2,tlab/CORL/runs/8kttz1sn +10% BC,antmaze-medium-diverse-v2,tlab/CORL/runs/ff25hg32 +10% BC,antmaze-medium-diverse-v2,tlab/CORL/runs/78amw82p +10% BC,antmaze-medium-diverse-v2,tlab/CORL/runs/cpjh73vs +10% BC,antmaze-umaze-diverse-v2,tlab/CORL/runs/hp0bkr3b +10% BC,antmaze-umaze-diverse-v2,tlab/CORL/runs/3ipfrw0t +10% BC,antmaze-umaze-diverse-v2,tlab/CORL/runs/fro1zetk +10% BC,antmaze-umaze-diverse-v2,tlab/CORL/runs/fn2ayr34 
+BC,antmaze-large-diverse-v2,tlab/CORL/runs/q1qo78cv +BC,antmaze-large-diverse-v2,tlab/CORL/runs/pi0kt1h0 +BC,antmaze-large-diverse-v2,tlab/CORL/runs/i1m7zlc7 +BC,antmaze-large-diverse-v2,tlab/CORL/runs/82lwzknn +BC,antmaze-medium-diverse-v2,tlab/CORL/runs/7abomo1w +BC,antmaze-medium-diverse-v2,tlab/CORL/runs/lp1mc9wm +BC,antmaze-medium-diverse-v2,tlab/CORL/runs/yqbyo6si +BC,antmaze-medium-diverse-v2,tlab/CORL/runs/uu0owh56 +BC,antmaze-umaze-diverse-v2,tlab/CORL/runs/k1gm3rmp +BC,antmaze-umaze-diverse-v2,tlab/CORL/runs/dupp1690 +BC,antmaze-umaze-diverse-v2,tlab/CORL/runs/p1pfswg5 +BC,antmaze-umaze-diverse-v2,tlab/CORL/runs/ymfr22je +AWAC,antmaze-large-diverse-v2,tlab/CORL/runs/dscbc6bc +AWAC,antmaze-large-diverse-v2,tlab/CORL/runs/z156oi6e +AWAC,antmaze-large-diverse-v2,tlab/CORL/runs/qre0pruu +AWAC,antmaze-large-diverse-v2,tlab/CORL/runs/xdypawgj +AWAC,antmaze-medium-diverse-v2,tlab/CORL/runs/abf0hr8n +AWAC,antmaze-medium-diverse-v2,tlab/CORL/runs/re1tsj7q +AWAC,antmaze-medium-diverse-v2,tlab/CORL/runs/xqp0p1tj +AWAC,antmaze-medium-diverse-v2,tlab/CORL/runs/t5i68m4m +AWAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/xuklmxcl +AWAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/ci1ipp6g +AWAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/59i07yhh +AWAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/sgl07j7a +SAC-N,antmaze-large-diverse-v2,tlab/CORL/runs/k7ibnxhy +SAC-N,antmaze-large-diverse-v2,tlab/CORL/runs/pdx9por7 +SAC-N,antmaze-large-diverse-v2,tlab/CORL/runs/d38mt6p0 +SAC-N,antmaze-large-diverse-v2,tlab/CORL/runs/95oo01bk +SAC-N,antmaze-medium-diverse-v2,tlab/CORL/runs/k97b6k5u +SAC-N,antmaze-medium-diverse-v2,tlab/CORL/runs/9p5pfuf5 +SAC-N,antmaze-medium-diverse-v2,tlab/CORL/runs/crh0a497 +SAC-N,antmaze-medium-diverse-v2,tlab/CORL/runs/93kw5nzv +SAC-N,antmaze-umaze-diverse-v2,tlab/CORL/runs/8afnikfe +SAC-N,antmaze-umaze-diverse-v2,tlab/CORL/runs/9vt0ti5k +SAC-N,antmaze-umaze-diverse-v2,tlab/CORL/runs/1k9vjghg +SAC-N,antmaze-umaze-diverse-v2,tlab/CORL/runs/ovvavzxb +EDAC,antmaze-large-diverse-v2,tlab/CORL/runs/v25gsp4y +EDAC,antmaze-large-diverse-v2,tlab/CORL/runs/v3xh5etj +EDAC,antmaze-large-diverse-v2,tlab/CORL/runs/ue4f3kbw +EDAC,antmaze-large-diverse-v2,tlab/CORL/runs/ajal32lc +EDAC,antmaze-medium-diverse-v2,tlab/CORL/runs/hubnt7ft +EDAC,antmaze-medium-diverse-v2,tlab/CORL/runs/g5q10m44 +EDAC,antmaze-medium-diverse-v2,tlab/CORL/runs/vgu6oirc +EDAC,antmaze-medium-diverse-v2,tlab/CORL/runs/8e546u0p +EDAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/5ro1a79t +EDAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/ox1pady5 +EDAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/rvpporaw +EDAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/69z5r1tl +IQL,antmaze-large-diverse-v2,tlab/CORL/runs/p91kz1k1 +IQL,antmaze-large-diverse-v2,tlab/CORL/runs/0a7zgwep +IQL,antmaze-large-diverse-v2,tlab/CORL/runs/yxhl3ws5 +IQL,antmaze-large-diverse-v2,tlab/CORL/runs/l6d643mk +IQL,antmaze-medium-diverse-v2,tlab/CORL/runs/pby1ya7l +IQL,antmaze-medium-diverse-v2,tlab/CORL/runs/5qcqqkyr +IQL,antmaze-medium-diverse-v2,tlab/CORL/runs/pslw6kyn +IQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/d1bhjk3p +IQL,antmaze-medium-diverse-v2,tlab/CORL/runs/zq0xgzgn +IQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/arnw9pcg +IQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/ey8pxi46 +IQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/63j9ime2 +TD3+BC,antmaze-large-diverse-v2,tlab/CORL/runs/q5hxh023 +TD3+BC,antmaze-large-diverse-v2,tlab/CORL/runs/rlreidub +TD3+BC,antmaze-large-diverse-v2,tlab/CORL/runs/vr1506ft +TD3+BC,antmaze-large-diverse-v2,tlab/CORL/runs/x6pfzk5h 
+TD3+BC,antmaze-medium-diverse-v2,tlab/CORL/runs/k9nl1lqo +TD3+BC,antmaze-medium-diverse-v2,tlab/CORL/runs/0lxwc925 +TD3+BC,antmaze-medium-diverse-v2,tlab/CORL/runs/6tejd5kf +TD3+BC,antmaze-medium-diverse-v2,tlab/CORL/runs/xzpq4u3k +TD3+BC,antmaze-umaze-diverse-v2,tlab/CORL/runs/ulftdkfk +TD3+BC,antmaze-umaze-diverse-v2,tlab/CORL/runs/3ahhw3q0 +TD3+BC,antmaze-umaze-diverse-v2,tlab/CORL/runs/h9pxlnai +TD3+BC,antmaze-umaze-diverse-v2,tlab/CORL/runs/o6zv6o09 +DT,antmaze-large-diverse-v2,tlab/CORL/runs/u6wabr2z +DT,antmaze-large-diverse-v2,tlab/CORL/runs/rrcdzrq7 +DT,antmaze-large-diverse-v2,tlab/CORL/runs/w8yi7qjn +DT,antmaze-large-diverse-v2,tlab/CORL/runs/3tot8bwr +DT,antmaze-medium-diverse-v2,tlab/CORL/runs/s9qb61hk +DT,antmaze-medium-diverse-v2,tlab/CORL/runs/7qkjasye +DT,antmaze-medium-diverse-v2,tlab/CORL/runs/18ps31gs +DT,antmaze-medium-diverse-v2,tlab/CORL/runs/upc73fx3 +DT,antmaze-umaze-diverse-v2,tlab/CORL/runs/et3nrjye +DT,antmaze-umaze-diverse-v2,tlab/CORL/runs/wwi8jedd +DT,antmaze-umaze-diverse-v2,tlab/CORL/runs/8m8ffzfl +DT,antmaze-umaze-diverse-v2,tlab/CORL/runs/7abiphxz +IQL,relocate-human-v1,tlab/CORL/runs/i2s2j8i5 +IQL,relocate-human-v1,tlab/CORL/runs/rngiqdvr +IQL,relocate-human-v1,tlab/CORL/runs/5u64dj8e +IQL,relocate-human-v1,tlab/CORL/runs/zcq9utiz +IQL,relocate-expert-v1,tlab/CORL/runs/388v30kc +IQL,relocate-expert-v1,tlab/CORL/runs/qzg6j3y1 +IQL,relocate-expert-v1,tlab/CORL/runs/gl3esr43 +IQL,relocate-expert-v1,tlab/CORL/runs/59o0u5g2 +IQL,relocate-cloned-v1,tlab/CORL/runs/l566fh6t +IQL,relocate-cloned-v1,tlab/CORL/runs/cbgjupnb +IQL,relocate-cloned-v1,tlab/CORL/runs/o0is0jml +IQL,relocate-cloned-v1,tlab/CORL/runs/gqdep3in +IQL,hammer-human-v1,tlab/CORL/runs/ndzrylhn +IQL,hammer-human-v1,tlab/CORL/runs/w8tib4nq +IQL,hammer-human-v1,tlab/CORL/runs/ji2zsynd +IQL,hammer-human-v1,tlab/CORL/runs/jbno3nx0 +IQL,hammer-expert-v1,tlab/CORL/runs/2v1sm9t2 +IQL,hammer-expert-v1,tlab/CORL/runs/7h05ijht +IQL,hammer-expert-v1,tlab/CORL/runs/cnmncyas +IQL,hammer-expert-v1,tlab/CORL/runs/7oomdqw2 +IQL,hammer-cloned-v1,tlab/CORL/runs/sqq1maqv +IQL,hammer-cloned-v1,tlab/CORL/runs/51ci5vmi +IQL,hammer-cloned-v1,tlab/CORL/runs/376pgvjb +IQL,door-human-v1,tlab/CORL/runs/7cptqv3m +IQL,door-human-v1,tlab/CORL/runs/8qhgzgrd +IQL,door-human-v1,tlab/CORL/runs/dpdcjz32 +IQL,hammer-cloned-v1,tlab/CORL/runs/q46n2xc0 +IQL,door-expert-v1,tlab/CORL/runs/9xi9pp31 +IQL,door-human-v1,tlab/CORL/runs/r5s1taxi +IQL,door-expert-v1,tlab/CORL/runs/c9y9cygf +IQL,door-expert-v1,tlab/CORL/runs/ouh15i78 +IQL,door-expert-v1,tlab/CORL/runs/8tjub9t1 +IQL,door-cloned-v1,tlab/CORL/runs/eory1n9f +IQL,door-cloned-v1,tlab/CORL/runs/gmgn36xz +IQL,door-cloned-v1,tlab/CORL/runs/8sbctwih +IQL,door-cloned-v1,tlab/CORL/runs/jd2zor0j +IQL,pen-human-v1,tlab/CORL/runs/dc0wiy8m +IQL,pen-human-v1,tlab/CORL/runs/r1449frb +IQL,pen-human-v1,tlab/CORL/runs/zz4f7g85 +IQL,pen-human-v1,tlab/CORL/runs/vc5fzv6z +IQL,pen-expert-v1,tlab/CORL/runs/rysocpho +IQL,pen-expert-v1,tlab/CORL/runs/7cq4lpb9 +IQL,pen-expert-v1,tlab/CORL/runs/v2okidht +IQL,pen-expert-v1,tlab/CORL/runs/qdhdtyzq +IQL,pen-cloned-v1,tlab/CORL/runs/2l0ud08j +IQL,pen-cloned-v1,tlab/CORL/runs/048oglhq +IQL,pen-cloned-v1,tlab/CORL/runs/d6se70vf +IQL,pen-cloned-v1,tlab/CORL/runs/siglsbpl +TD3+BC,relocate-human-v1,tlab/CORL/runs/tfj4102v +TD3+BC,relocate-human-v1,tlab/CORL/runs/504nu7lm +TD3+BC,relocate-human-v1,tlab/CORL/runs/yyol4l9r +TD3+BC,relocate-human-v1,tlab/CORL/runs/3slcqx3e +TD3+BC,relocate-expert-v1,tlab/CORL/runs/o7t0kda0 
+TD3+BC,relocate-expert-v1,tlab/CORL/runs/x38fwus5 +TD3+BC,relocate-expert-v1,tlab/CORL/runs/cp5abb79 +TD3+BC,relocate-expert-v1,tlab/CORL/runs/2k1dv28e +TD3+BC,relocate-cloned-v1,tlab/CORL/runs/ftrcsjbr +TD3+BC,relocate-cloned-v1,tlab/CORL/runs/su0lc1eq +TD3+BC,relocate-cloned-v1,tlab/CORL/runs/23ozl6iu +TD3+BC,relocate-cloned-v1,tlab/CORL/runs/gk6hn2th +TD3+BC,hammer-human-v1,tlab/CORL/runs/1f9p7umj +TD3+BC,hammer-human-v1,tlab/CORL/runs/ktyl3py5 +TD3+BC,hammer-human-v1,tlab/CORL/runs/5nocwd0s +TD3+BC,hammer-human-v1,tlab/CORL/runs/19ep2hhf +TD3+BC,hammer-expert-v1,tlab/CORL/runs/8fvepzs1 +TD3+BC,hammer-expert-v1,tlab/CORL/runs/76h43fi7 +TD3+BC,hammer-expert-v1,tlab/CORL/runs/s6efgsfl +TD3+BC,hammer-expert-v1,tlab/CORL/runs/6fhsckee +TD3+BC,hammer-cloned-v1,tlab/CORL/runs/mn8wj4vx +TD3+BC,hammer-cloned-v1,tlab/CORL/runs/2njey5rs +TD3+BC,hammer-cloned-v1,tlab/CORL/runs/aak4jk4q +TD3+BC,hammer-cloned-v1,tlab/CORL/runs/4rpg8n7s +TD3+BC,door-human-v1,tlab/CORL/runs/o1w608uo +TD3+BC,door-human-v1,tlab/CORL/runs/3ah048zk +TD3+BC,door-human-v1,tlab/CORL/runs/tgo85qxi +TD3+BC,door-human-v1,tlab/CORL/runs/0na2k76t +TD3+BC,door-expert-v1,tlab/CORL/runs/fg3r94l7 +TD3+BC,door-expert-v1,tlab/CORL/runs/m2xkvcra +TD3+BC,door-expert-v1,tlab/CORL/runs/vj3tyaqq +TD3+BC,door-expert-v1,tlab/CORL/runs/k6wb9o7n +TD3+BC,door-cloned-v1,tlab/CORL/runs/y1658d82 +TD3+BC,door-cloned-v1,tlab/CORL/runs/jgmexzrz +TD3+BC,door-cloned-v1,tlab/CORL/runs/65woeq5y +TD3+BC,door-cloned-v1,tlab/CORL/runs/0soehz0q +TD3+BC,pen-human-v1,tlab/CORL/runs/veinxlsl +TD3+BC,pen-human-v1,tlab/CORL/runs/w623y6nm +TD3+BC,pen-human-v1,tlab/CORL/runs/904ibgqu +TD3+BC,pen-human-v1,tlab/CORL/runs/hycdfakt +TD3+BC,pen-expert-v1,tlab/CORL/runs/n3l285rv +TD3+BC,pen-expert-v1,tlab/CORL/runs/c63peo3w +TD3+BC,pen-expert-v1,tlab/CORL/runs/4ml1wstn +TD3+BC,pen-expert-v1,tlab/CORL/runs/tle46su1 +TD3+BC,pen-cloned-v1,tlab/CORL/runs/o966f6gc +TD3+BC,pen-cloned-v1,tlab/CORL/runs/r7ug0kww +TD3+BC,pen-cloned-v1,tlab/CORL/runs/gmoq93do +TD3+BC,pen-cloned-v1,tlab/CORL/runs/7wjpi5qq +BC,relocate-human-v1,tlab/CORL/runs/f65zvj4o +BC,relocate-human-v1,tlab/CORL/runs/p1zpysnh +BC,relocate-human-v1,tlab/CORL/runs/mv3fa0m0 +BC,relocate-human-v1,tlab/CORL/runs/8yfoy1lf +BC,relocate-expert-v1,tlab/CORL/runs/io5b9chm +BC,relocate-expert-v1,tlab/CORL/runs/o8mxy0ef +BC,relocate-expert-v1,tlab/CORL/runs/13dp14jz +BC,relocate-expert-v1,tlab/CORL/runs/9jwfaaq8 +BC,relocate-cloned-v1,tlab/CORL/runs/tpdeaioz +BC,relocate-cloned-v1,tlab/CORL/runs/4xj7wpkz +BC,relocate-cloned-v1,tlab/CORL/runs/blfigkcf +BC,relocate-cloned-v1,tlab/CORL/runs/s681x8c3 +BC,hammer-human-v1,tlab/CORL/runs/ohcmuvhf +BC,hammer-human-v1,tlab/CORL/runs/wky03m4f +BC,hammer-human-v1,tlab/CORL/runs/ospra6o4 +BC,hammer-human-v1,tlab/CORL/runs/4hgbkyii +BC,hammer-expert-v1,tlab/CORL/runs/veyiqvwm +BC,hammer-expert-v1,tlab/CORL/runs/iefob6py +BC,hammer-expert-v1,tlab/CORL/runs/0elxsh4v +BC,hammer-expert-v1,tlab/CORL/runs/97x5w6ri +BC,hammer-cloned-v1,tlab/CORL/runs/mceyva7f +BC,hammer-cloned-v1,tlab/CORL/runs/qmfznoe0 +BC,hammer-cloned-v1,tlab/CORL/runs/mh0f7tfa +BC,hammer-cloned-v1,tlab/CORL/runs/sufi5c7t +BC,door-human-v1,tlab/CORL/runs/ags47j3o +BC,door-human-v1,tlab/CORL/runs/7tolkiip +BC,door-human-v1,tlab/CORL/runs/plaufnly +BC,door-human-v1,tlab/CORL/runs/ccelg0rr +BC,door-expert-v1,tlab/CORL/runs/1l3879rw +BC,door-expert-v1,tlab/CORL/runs/yjnjzjdi +BC,door-expert-v1,tlab/CORL/runs/gzvbgoqc +BC,door-expert-v1,tlab/CORL/runs/8d8e2j92 +BC,door-cloned-v1,tlab/CORL/runs/60soq2ty 
+BC,door-cloned-v1,tlab/CORL/runs/vjddul64 +BC,door-cloned-v1,tlab/CORL/runs/n3n67jvq +BC,door-cloned-v1,tlab/CORL/runs/5qa4yvlg +BC,pen-human-v1,tlab/CORL/runs/g2tibpcp +BC,pen-human-v1,tlab/CORL/runs/iqh7t6l4 +BC,pen-human-v1,tlab/CORL/runs/e2mu7nd9 +BC,pen-human-v1,tlab/CORL/runs/wzmc2sxm +BC,pen-expert-v1,tlab/CORL/runs/wyk7fphz +BC,pen-expert-v1,tlab/CORL/runs/4gv0kee4 +BC,pen-expert-v1,tlab/CORL/runs/ud3ksqym +BC,pen-expert-v1,tlab/CORL/runs/aqwfqqor +BC,pen-cloned-v1,tlab/CORL/runs/pu252roz +BC,pen-cloned-v1,tlab/CORL/runs/vz9iz9kf +BC,pen-cloned-v1,tlab/CORL/runs/nqbvhn0k +BC,pen-cloned-v1,tlab/CORL/runs/5ki0sfuj +CQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/23ucy4d8 +CQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/89wa6a31 +CQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/oq48z3ai +CQL,antmaze-umaze-diverse-v2,tlab/CORL/runs/rjqb59zf +CQL,antmaze-medium-diverse-v2,tlab/CORL/runs/4xhpny6g +CQL,antmaze-medium-diverse-v2,tlab/CORL/runs/9ricjc5x +CQL,antmaze-medium-diverse-v2,tlab/CORL/runs/smhkhtll +CQL,antmaze-medium-diverse-v2,tlab/CORL/runs/0gbh6te4 +CQL,antmaze-large-diverse-v2,tlab/CORL/runs/hdnlotos +CQL,antmaze-large-diverse-v2,tlab/CORL/runs/fnqhas74 +CQL,antmaze-large-diverse-v2,tlab/CORL/runs/lcw09by0 +CQL,antmaze-large-diverse-v2,tlab/CORL/runs/vnuqanhz +CQL,antmaze-umaze-v2,tlab/CORL/runs/5bg573bn +CQL,antmaze-umaze-v2,tlab/CORL/runs/hkcwdbwq +CQL,antmaze-umaze-v2,tlab/CORL/runs/fiuskdxp +CQL,antmaze-umaze-v2,tlab/CORL/runs/zjrjune9 +CQL,antmaze-medium-play-v2,tlab/CORL/runs/fk6axqvt +CQL,antmaze-medium-play-v2,tlab/CORL/runs/fw4tn8o0 +CQL,antmaze-medium-play-v2,tlab/CORL/runs/rqlvhl2o +CQL,antmaze-medium-play-v2,tlab/CORL/runs/z03uv1ho +CQL,antmaze-large-play-v2,tlab/CORL/runs/4ftf757w +CQL,antmaze-large-play-v2,tlab/CORL/runs/pbzcz5os +CQL,antmaze-large-play-v2,tlab/CORL/runs/xiyetw2k +CQL,antmaze-large-play-v2,tlab/CORL/runs/0czkrq9p +CQL,relocate-expert-v1,tlab/CORL/runs/uzjtfg2a +CQL,relocate-expert-v1,tlab/CORL/runs/g2clspvn +CQL,relocate-expert-v1,tlab/CORL/runs/dhy1bjj5 +CQL,relocate-expert-v1,tlab/CORL/runs/czhvz88z +CQL,relocate-cloned-v1,tlab/CORL/runs/9kl2dg7h +CQL,relocate-cloned-v1,tlab/CORL/runs/pigtti0c +CQL,relocate-cloned-v1,tlab/CORL/runs/qom6luqy +CQL,relocate-cloned-v1,tlab/CORL/runs/zodxeo4u +CQL,relocate-human-v1,tlab/CORL/runs/eb1kqmb2 +CQL,relocate-human-v1,tlab/CORL/runs/2v22it3v +CQL,relocate-human-v1,tlab/CORL/runs/ptnr258q +CQL,relocate-human-v1,tlab/CORL/runs/y626250d +CQL,hammer-expert-v1,tlab/CORL/runs/ijkgmg31 +CQL,hammer-expert-v1,tlab/CORL/runs/pk2pxjd3 +CQL,hammer-expert-v1,tlab/CORL/runs/3wmpla35 +CQL,hammer-expert-v1,tlab/CORL/runs/uasmxtx2 +CQL,hammer-cloned-v1,tlab/CORL/runs/sftyzwma +CQL,hammer-cloned-v1,tlab/CORL/runs/p5u9mbc2 +CQL,hammer-cloned-v1,tlab/CORL/runs/8mz694df +CQL,hammer-cloned-v1,tlab/CORL/runs/vsgfq9di +CQL,hammer-human-v1,tlab/CORL/runs/w82qd90y +CQL,hammer-human-v1,tlab/CORL/runs/fbyysiv5 +CQL,hammer-human-v1,tlab/CORL/runs/dynrxrlq +CQL,hammer-human-v1,tlab/CORL/runs/ooqzzqqp +CQL,door-expert-v1,tlab/CORL/runs/7ydxakid +CQL,door-expert-v1,tlab/CORL/runs/gb2jsliu +CQL,door-expert-v1,tlab/CORL/runs/ld3czzwk +CQL,door-expert-v1,tlab/CORL/runs/zg4mfm7n +CQL,door-cloned-v1,tlab/CORL/runs/aoxv3uez +CQL,door-cloned-v1,tlab/CORL/runs/ms221pm9 +CQL,door-cloned-v1,tlab/CORL/runs/d42y8jfn +CQL,door-cloned-v1,tlab/CORL/runs/09uu4sjo +CQL,door-human-v1,tlab/CORL/runs/tfv810xu +CQL,door-human-v1,tlab/CORL/runs/p8xgt780 +CQL,door-human-v1,tlab/CORL/runs/oxgngb29 +CQL,door-human-v1,tlab/CORL/runs/3afd4fue 
+CQL,pen-expert-v1,tlab/CORL/runs/009cras3 +CQL,pen-expert-v1,tlab/CORL/runs/4ovelyp9 +CQL,pen-expert-v1,tlab/CORL/runs/cz0kyrn1 +CQL,pen-expert-v1,tlab/CORL/runs/chb9ofqa +CQL,pen-cloned-v1,tlab/CORL/runs/fdr8nfxm +CQL,pen-cloned-v1,tlab/CORL/runs/qftxuezq +CQL,pen-cloned-v1,tlab/CORL/runs/oesbjsk6 +CQL,pen-cloned-v1,tlab/CORL/runs/s54asgni +CQL,pen-human-v1,tlab/CORL/runs/4677gtu2 +CQL,pen-human-v1,tlab/CORL/runs/kk4c2nk3 +CQL,pen-human-v1,tlab/CORL/runs/3vx498y2 +CQL,pen-human-v1,tlab/CORL/runs/8kkyonfp +CQL,maze2d-medium-v1,tlab/CORL/runs/mlf7uxr6 +CQL,maze2d-medium-v1,tlab/CORL/runs/v7d3kjzy +CQL,maze2d-medium-v1,tlab/CORL/runs/rv9pst9q +CQL,maze2d-medium-v1,tlab/CORL/runs/h8ufwg10 +CQL,maze2d-umaze-v1,tlab/CORL/runs/t9ijmo88 +CQL,maze2d-umaze-v1,tlab/CORL/runs/bnn0jer3 +CQL,maze2d-umaze-v1,tlab/CORL/runs/i2a998e2 +CQL,maze2d-umaze-v1,tlab/CORL/runs/9ismi4tl +CQL,maze2d-large-v1,tlab/CORL/runs/fhiu7v44 +CQL,maze2d-large-v1,tlab/CORL/runs/4q66w8u6 +CQL,maze2d-large-v1,tlab/CORL/runs/5dsnaaot +CQL,maze2d-large-v1,tlab/CORL/runs/8g9dg8vd +CQL,walker2d-medium-expert-v2,tlab/CORL/runs/tl0qw5wq +CQL,walker2d-medium-expert-v2,tlab/CORL/runs/tmp67g9j +CQL,walker2d-medium-expert-v2,tlab/CORL/runs/r5ot114r +CQL,walker2d-medium-expert-v2,tlab/CORL/runs/msq8m0tp +CQL,walker2d-medium-replay-v2,tlab/CORL/runs/r8q5rsa9 +CQL,walker2d-medium-replay-v2,tlab/CORL/runs/wngo8ejr +CQL,walker2d-medium-replay-v2,tlab/CORL/runs/ft3dko4q +CQL,walker2d-medium-replay-v2,tlab/CORL/runs/8q91eiry +CQL,walker2d-medium-v2,tlab/CORL/runs/fm36y935 +CQL,walker2d-medium-v2,tlab/CORL/runs/u3q29d2l +CQL,walker2d-medium-v2,tlab/CORL/runs/frgj318k +CQL,walker2d-medium-v2,tlab/CORL/runs/94om2uv0 +CQL,hopper-medium-expert-v2,tlab/CORL/runs/1bhq28y9 +CQL,hopper-medium-expert-v2,tlab/CORL/runs/09ibpo0r +CQL,hopper-medium-expert-v2,tlab/CORL/runs/io1xulc0 +CQL,hopper-medium-expert-v2,tlab/CORL/runs/ssnq58yr +CQL,hopper-medium-replay-v2,tlab/CORL/runs/4w4cv8yc +CQL,hopper-medium-replay-v2,tlab/CORL/runs/huqab2t4 +CQL,hopper-medium-replay-v2,tlab/CORL/runs/mzadhjpq +CQL,hopper-medium-replay-v2,tlab/CORL/runs/p87x77qd +CQL,hopper-medium-v2,tlab/CORL/runs/enpcog2z +CQL,hopper-medium-v2,tlab/CORL/runs/zqclhczv +CQL,hopper-medium-v2,tlab/CORL/runs/lkyvgtmx +CQL,hopper-medium-v2,tlab/CORL/runs/uj0jmzmf +CQL,halfcheetah-medium-expert-v2,tlab/CORL/runs/xkuuo1eh +CQL,halfcheetah-medium-expert-v2,tlab/CORL/runs/cl2pnfzk +CQL,halfcheetah-medium-expert-v2,tlab/CORL/runs/a2xrfqzu +CQL,halfcheetah-medium-expert-v2,tlab/CORL/runs/4dvqi459 +CQL,halfcheetah-medium-v2,tlab/CORL/runs/13k7alv7 +CQL,halfcheetah-medium-v2,tlab/CORL/runs/5hopxhsn +CQL,halfcheetah-medium-v2,tlab/CORL/runs/8otqc0ef +CQL,halfcheetah-medium-v2,tlab/CORL/runs/umgpse6d +CQL,halfcheetah-medium-replay-v2,tlab/CORL/runs/zssc3fqn +CQL,halfcheetah-medium-replay-v2,tlab/CORL/runs/kn40abun +CQL,halfcheetah-medium-replay-v2,tlab/CORL/runs/ybuze5mj +CQL,halfcheetah-medium-replay-v2,tlab/CORL/runs/yq376obj +SAC-N,relocate-human-v1,tlab/CORL/runs/y6fmekyd +SAC-N,relocate-human-v1,tlab/CORL/runs/ngtyzqo1 +SAC-N,relocate-human-v1,tlab/CORL/runs/vbqkgf5l +SAC-N,relocate-human-v1,tlab/CORL/runs/j2ab96m6 +SAC-N,relocate-expert-v1,tlab/CORL/runs/8ignnvvr +SAC-N,relocate-expert-v1,tlab/CORL/runs/mv66zfhw +SAC-N,relocate-expert-v1,tlab/CORL/runs/xj7egx3x +SAC-N,relocate-expert-v1,tlab/CORL/runs/pvgddh6o +SAC-N,relocate-cloned-v1,tlab/CORL/runs/eyee25fn +SAC-N,relocate-cloned-v1,tlab/CORL/runs/o7lvfjpa +SAC-N,relocate-cloned-v1,tlab/CORL/runs/2j2kvgtq 
+SAC-N,relocate-cloned-v1,tlab/CORL/runs/qkq7g5jc +SAC-N,hammer-human-v1,tlab/CORL/runs/1rvuo145 +SAC-N,hammer-human-v1,tlab/CORL/runs/um7ql2su +SAC-N,hammer-human-v1,tlab/CORL/runs/lfcatfiu +SAC-N,hammer-human-v1,tlab/CORL/runs/40ycjrj5 +SAC-N,hammer-expert-v1,tlab/CORL/runs/htencbzz +SAC-N,hammer-expert-v1,tlab/CORL/runs/eeginxcu +SAC-N,hammer-expert-v1,tlab/CORL/runs/kn9cnl37 +SAC-N,hammer-expert-v1,tlab/CORL/runs/14dhhoce +SAC-N,hammer-cloned-v1,tlab/CORL/runs/hflemx6n +SAC-N,hammer-cloned-v1,tlab/CORL/runs/638iwo1q +SAC-N,hammer-cloned-v1,tlab/CORL/runs/bezh1upd +SAC-N,hammer-cloned-v1,tlab/CORL/runs/swark17c +SAC-N,door-human-v1,tlab/CORL/runs/orrj3esc +SAC-N,door-human-v1,tlab/CORL/runs/i4bn37e8 +SAC-N,door-human-v1,tlab/CORL/runs/jdirytkz +SAC-N,door-human-v1,tlab/CORL/runs/9ad0e96t +SAC-N,door-expert-v1,tlab/CORL/runs/epzjfgfp +SAC-N,door-expert-v1,tlab/CORL/runs/uyhcmn5y +SAC-N,door-expert-v1,tlab/CORL/runs/jo76ainf +SAC-N,door-expert-v1,tlab/CORL/runs/jdtq0xb4 +SAC-N,door-cloned-v1,tlab/CORL/runs/zqx3zmup +SAC-N,door-cloned-v1,tlab/CORL/runs/lewsuoky +SAC-N,door-cloned-v1,tlab/CORL/runs/q5bmghdx +SAC-N,door-cloned-v1,tlab/CORL/runs/4z0kjhfn +SAC-N,pen-human-v1,tlab/CORL/runs/ifijn1hh +SAC-N,pen-human-v1,tlab/CORL/runs/ve5m2ybq +SAC-N,pen-human-v1,tlab/CORL/runs/2n7khux4 +SAC-N,pen-human-v1,tlab/CORL/runs/6b4znflm +SAC-N,pen-expert-v1,tlab/CORL/runs/4o03lnyy +SAC-N,pen-expert-v1,tlab/CORL/runs/gxh3gonx +SAC-N,pen-expert-v1,tlab/CORL/runs/3av3sks3 +SAC-N,pen-cloned-v1,tlab/CORL/runs/p03957er +SAC-N,pen-expert-v1,tlab/CORL/runs/ta4cwvkw +SAC-N,pen-cloned-v1,tlab/CORL/runs/dccu81fw +SAC-N,pen-cloned-v1,tlab/CORL/runs/7luq1t06 +SAC-N,pen-cloned-v1,tlab/CORL/runs/6o4ww5u3 +AWAC,relocate-human-v1,tlab/CORL/runs/koneghlm +AWAC,relocate-human-v1,tlab/CORL/runs/1wnicmy9 +AWAC,relocate-human-v1,tlab/CORL/runs/yev9j5kb +AWAC,relocate-human-v1,tlab/CORL/runs/2k3xn5ok +AWAC,relocate-expert-v1,tlab/CORL/runs/jpmjuqws +AWAC,relocate-expert-v1,tlab/CORL/runs/b7svog8d +AWAC,relocate-expert-v1,tlab/CORL/runs/pkrpttei +AWAC,relocate-expert-v1,tlab/CORL/runs/d4kyqbc6 +AWAC,relocate-cloned-v1,tlab/CORL/runs/hdkt971s +AWAC,relocate-cloned-v1,tlab/CORL/runs/n47rdpzw +AWAC,relocate-cloned-v1,tlab/CORL/runs/79vv8ybh +AWAC,relocate-cloned-v1,tlab/CORL/runs/36cdogtc +AWAC,hammer-human-v1,tlab/CORL/runs/3apz6bfy +AWAC,hammer-human-v1,tlab/CORL/runs/lxcdhzmh +AWAC,hammer-human-v1,tlab/CORL/runs/rardhaa4 +AWAC,hammer-human-v1,tlab/CORL/runs/jmm5nko1 +AWAC,hammer-expert-v1,tlab/CORL/runs/6pue3d7n +AWAC,hammer-expert-v1,tlab/CORL/runs/l3m1hut0 +AWAC,hammer-expert-v1,tlab/CORL/runs/u2advxu8 +AWAC,hammer-expert-v1,tlab/CORL/runs/zh9n7gj5 +AWAC,hammer-cloned-v1,tlab/CORL/runs/ie0p8obl +AWAC,hammer-cloned-v1,tlab/CORL/runs/x09vc5ou +AWAC,hammer-cloned-v1,tlab/CORL/runs/fe4gi02y +AWAC,hammer-cloned-v1,tlab/CORL/runs/rt589ldv +AWAC,door-human-v1,tlab/CORL/runs/u1jalueh +AWAC,door-human-v1,tlab/CORL/runs/91ymwtu2 +AWAC,door-human-v1,tlab/CORL/runs/0jhh1sgc +AWAC,door-human-v1,tlab/CORL/runs/cpuov2zh +AWAC,door-expert-v1,tlab/CORL/runs/8jm5wux2 +AWAC,door-expert-v1,tlab/CORL/runs/wajbnn4d +AWAC,door-expert-v1,tlab/CORL/runs/emnm0s46 +AWAC,door-expert-v1,tlab/CORL/runs/os0u6g89 +AWAC,door-cloned-v1,tlab/CORL/runs/9kufhdp1 +AWAC,door-cloned-v1,tlab/CORL/runs/1v9kcj4z +AWAC,door-cloned-v1,tlab/CORL/runs/sl3bpdhr +AWAC,door-cloned-v1,tlab/CORL/runs/jvxi0i16 +AWAC,pen-human-v1,tlab/CORL/runs/af9635q8 +AWAC,pen-human-v1,tlab/CORL/runs/7xsixq6t +AWAC,pen-human-v1,tlab/CORL/runs/v2xi2ulz 
+AWAC,pen-human-v1,tlab/CORL/runs/6nc1lkvg +AWAC,pen-expert-v1,tlab/CORL/runs/6b1p20q8 +AWAC,pen-expert-v1,tlab/CORL/runs/n38xryr9 +AWAC,pen-expert-v1,tlab/CORL/runs/t44bqr3j +AWAC,pen-expert-v1,tlab/CORL/runs/5b6anbbp +AWAC,pen-cloned-v1,tlab/CORL/runs/jvd44w4q +AWAC,pen-cloned-v1,tlab/CORL/runs/km1d8kg1 +AWAC,pen-cloned-v1,tlab/CORL/runs/fcdr5g0q +AWAC,pen-cloned-v1,tlab/CORL/runs/oha48xvi +10% BC,relocate-human-v1,tlab/CORL/runs/yin9f3yd +10% BC,relocate-human-v1,tlab/CORL/runs/pw05uhmu +10% BC,relocate-human-v1,tlab/CORL/runs/evcxywd2 +10% BC,relocate-human-v1,tlab/CORL/runs/sgln8nnp +10% BC,door-human-v1,tlab/CORL/runs/39acnkdw +10% BC,door-human-v1,tlab/CORL/runs/04s2l9yl +10% BC,door-human-v1,tlab/CORL/runs/70sogf17 +10% BC,door-human-v1,tlab/CORL/runs/f1bmq6x5 +10% BC,pen-human-v1,tlab/CORL/runs/n0i73136 +10% BC,pen-human-v1,tlab/CORL/runs/u1x5pkbm +10% BC,pen-human-v1,tlab/CORL/runs/n6x8bp0u +10% BC,pen-human-v1,tlab/CORL/runs/c5ghimet +10% BC,relocate-expert-v1,tlab/CORL/runs/6lj43177 +10% BC,relocate-expert-v1,tlab/CORL/runs/d9qsh6ja +10% BC,relocate-expert-v1,tlab/CORL/runs/n5z2o857 +10% BC,relocate-expert-v1,tlab/CORL/runs/hot1a16e +10% BC,relocate-cloned-v1,tlab/CORL/runs/cx5c9llm +10% BC,relocate-cloned-v1,tlab/CORL/runs/a8yhh1kh +10% BC,relocate-cloned-v1,tlab/CORL/runs/6kreyxso +10% BC,relocate-cloned-v1,tlab/CORL/runs/er1wc045 +10% BC,hammer-human-v1,tlab/CORL/runs/tpvrqg52 +10% BC,hammer-human-v1,tlab/CORL/runs/28d5rjje +10% BC,hammer-human-v1,tlab/CORL/runs/1rxwcfym +10% BC,hammer-human-v1,tlab/CORL/runs/flck5w8u +10% BC,hammer-expert-v1,tlab/CORL/runs/gckapfmf +10% BC,hammer-expert-v1,tlab/CORL/runs/y95gpl9g +10% BC,hammer-expert-v1,tlab/CORL/runs/vzalh68o +10% BC,hammer-expert-v1,tlab/CORL/runs/gh5rqlh7 +10% BC,hammer-cloned-v1,tlab/CORL/runs/505fybd5 +10% BC,hammer-cloned-v1,tlab/CORL/runs/jsvkg054 +10% BC,hammer-cloned-v1,tlab/CORL/runs/843kkx6e +10% BC,hammer-cloned-v1,tlab/CORL/runs/ouilhk59 +10% BC,door-expert-v1,tlab/CORL/runs/pzkiimti +10% BC,door-expert-v1,tlab/CORL/runs/av4aj7oj +10% BC,door-expert-v1,tlab/CORL/runs/zjvo7cop +10% BC,door-expert-v1,tlab/CORL/runs/8lk9aoh4 +10% BC,door-cloned-v1,tlab/CORL/runs/y1ifoeoz +10% BC,door-cloned-v1,tlab/CORL/runs/63ggz7zy +10% BC,door-cloned-v1,tlab/CORL/runs/3i6l6r94 +10% BC,door-cloned-v1,tlab/CORL/runs/ked7pm92 +10% BC,pen-expert-v1,tlab/CORL/runs/3cnvhi98 +10% BC,pen-expert-v1,tlab/CORL/runs/7wwrrdub +10% BC,pen-expert-v1,tlab/CORL/runs/9oyo815f +10% BC,pen-expert-v1,tlab/CORL/runs/4pnv9qdi +10% BC,pen-cloned-v1,tlab/CORL/runs/mcitpl4d +10% BC,pen-cloned-v1,tlab/CORL/runs/n3mp779g +10% BC,pen-cloned-v1,tlab/CORL/runs/tr6vmlfb +10% BC,pen-cloned-v1,tlab/CORL/runs/3a3g4sxn +DT,relocate-human-v1,tlab/CORL/runs/tmph110d +DT,relocate-human-v1,tlab/CORL/runs/ltlzvjpk +DT,relocate-human-v1,tlab/CORL/runs/2cjbur1h +DT,relocate-human-v1,tlab/CORL/runs/3o7x5zk9 +DT,relocate-expert-v1,tlab/CORL/runs/zk6a2mcd +DT,relocate-expert-v1,tlab/CORL/runs/qm9xasei +DT,relocate-expert-v1,tlab/CORL/runs/raybm4k1 +DT,relocate-expert-v1,tlab/CORL/runs/072ciy8d +DT,relocate-cloned-v1,tlab/CORL/runs/0d7mwrek +DT,relocate-cloned-v1,tlab/CORL/runs/rl29ga8y +DT,relocate-cloned-v1,tlab/CORL/runs/7x6bdpmz +DT,relocate-cloned-v1,tlab/CORL/runs/ccvsi4v0 +DT,hammer-human-v1,tlab/CORL/runs/s0vvf0jg +DT,hammer-human-v1,tlab/CORL/runs/h9i2s25u +DT,hammer-human-v1,tlab/CORL/runs/1gwf91ld +DT,hammer-human-v1,tlab/CORL/runs/r5x4xjuy +DT,hammer-expert-v1,tlab/CORL/runs/i9f5n8uw +DT,hammer-expert-v1,tlab/CORL/runs/cizt4h9c 
+DT,hammer-expert-v1,tlab/CORL/runs/fmoy7bgg +DT,hammer-expert-v1,tlab/CORL/runs/p0nn6v0u +DT,hammer-cloned-v1,tlab/CORL/runs/sux4p7bc +DT,hammer-cloned-v1,tlab/CORL/runs/xc8o2rc5 +DT,hammer-cloned-v1,tlab/CORL/runs/te2skk3h +DT,hammer-cloned-v1,tlab/CORL/runs/m3nxe4xk +DT,door-human-v1,tlab/CORL/runs/f89z7yay +DT,door-human-v1,tlab/CORL/runs/hicu5lff +DT,door-human-v1,tlab/CORL/runs/1gh2i8hx +DT,door-human-v1,tlab/CORL/runs/14j6pxj1 +DT,door-expert-v1,tlab/CORL/runs/xl154nbc +DT,door-expert-v1,tlab/CORL/runs/2p9ckc5t +DT,door-expert-v1,tlab/CORL/runs/a0ngsou0 +DT,door-expert-v1,tlab/CORL/runs/97rx82t8 +DT,door-cloned-v1,tlab/CORL/runs/hf6za1ef +DT,door-cloned-v1,tlab/CORL/runs/uxl2gh2b +DT,door-cloned-v1,tlab/CORL/runs/jm5vbkxw +DT,door-cloned-v1,tlab/CORL/runs/kdxu6obu +DT,pen-human-v1,tlab/CORL/runs/afpw3bc6 +DT,pen-human-v1,tlab/CORL/runs/d0x6pyuq +DT,pen-human-v1,tlab/CORL/runs/j50lnc26 +DT,pen-human-v1,tlab/CORL/runs/2335kk3e +DT,pen-expert-v1,tlab/CORL/runs/8224ikaw +DT,pen-expert-v1,tlab/CORL/runs/r8ritqs9 +DT,pen-expert-v1,tlab/CORL/runs/x9vrnim5 +DT,pen-expert-v1,tlab/CORL/runs/rj4if1sh +DT,pen-cloned-v1,tlab/CORL/runs/b3ft5mzg +DT,pen-cloned-v1,tlab/CORL/runs/mnhbgmbq +DT,pen-cloned-v1,tlab/CORL/runs/i8ly92na +DT,pen-cloned-v1,tlab/CORL/runs/9cglug0v +EDAC,relocate-human-v1,tlab/CORL/runs/eqhdb3gt +EDAC,hammer-expert-v1,tlab/CORL/runs/rwwcdgeq +EDAC,hammer-expert-v1,tlab/CORL/runs/ma8j04l1 +EDAC,door-expert-v1,tlab/CORL/runs/9pcl81nq +EDAC,door-expert-v1,tlab/CORL/runs/cpg8brzz +EDAC,relocate-expert-v1,tlab/CORL/runs/7dwh17mo +EDAC,relocate-expert-v1,tlab/CORL/runs/zyxe8qaj +EDAC,relocate-expert-v1,tlab/CORL/runs/0st05lth +EDAC,relocate-expert-v1,tlab/CORL/runs/intmaaa8 +EDAC,hammer-expert-v1,tlab/CORL/runs/qd2uqu1k +EDAC,hammer-expert-v1,tlab/CORL/runs/0t21iy35 +EDAC,door-expert-v1,tlab/CORL/runs/3sqh1joa +EDAC,door-expert-v1,tlab/CORL/runs/5hzic4sb +EDAC,pen-expert-v1,tlab/CORL/runs/yj2cma9e +EDAC,pen-expert-v1,tlab/CORL/runs/zm89ur97 +EDAC,pen-expert-v1,tlab/CORL/runs/jn71kgzm +EDAC,pen-expert-v1,tlab/CORL/runs/4b3vdkex +EDAC,relocate-human-v1,tlab/CORL/runs/y8534wji +EDAC,relocate-human-v1,tlab/CORL/runs/jv8pu1um +EDAC,relocate-human-v1,tlab/CORL/runs/1ur7waoa +EDAC,relocate-cloned-v1,tlab/CORL/runs/wn2hqhqp +EDAC,relocate-cloned-v1,tlab/CORL/runs/omok4336 +EDAC,relocate-cloned-v1,tlab/CORL/runs/kg5ycrcx +EDAC,relocate-cloned-v1,tlab/CORL/runs/cqrsvhl8 +EDAC,hammer-human-v1,tlab/CORL/runs/b9wzvju6 +EDAC,hammer-human-v1,tlab/CORL/runs/ff3hpp7h +EDAC,hammer-human-v1,tlab/CORL/runs/7xdbncab +EDAC,hammer-human-v1,tlab/CORL/runs/n0xavhn3 +EDAC,hammer-cloned-v1,tlab/CORL/runs/75jxwrfw +EDAC,hammer-cloned-v1,tlab/CORL/runs/32dcllfk +EDAC,hammer-cloned-v1,tlab/CORL/runs/gmtobkw3 +EDAC,hammer-cloned-v1,tlab/CORL/runs/r5tu153u +EDAC,door-human-v1,tlab/CORL/runs/p9m4ctv1 +EDAC,door-human-v1,tlab/CORL/runs/sz7xpwae +EDAC,door-human-v1,tlab/CORL/runs/3necfesb +EDAC,door-human-v1,tlab/CORL/runs/sxlywf0z +EDAC,door-cloned-v1,tlab/CORL/runs/svdq0d88 +EDAC,door-cloned-v1,tlab/CORL/runs/r5od718p +EDAC,door-cloned-v1,tlab/CORL/runs/m58fi9zm +EDAC,door-cloned-v1,tlab/CORL/runs/isp6o9jw +EDAC,pen-human-v1,tlab/CORL/runs/olb1o4q6 +EDAC,pen-human-v1,tlab/CORL/runs/v16tnr1h +EDAC,pen-human-v1,tlab/CORL/runs/v2om24m3 +EDAC,pen-human-v1,tlab/CORL/runs/uia7byut +EDAC,pen-cloned-v1,tlab/CORL/runs/xt2wslvy +EDAC,pen-cloned-v1,tlab/CORL/runs/shu1cif0 +EDAC,pen-cloned-v1,tlab/CORL/runs/705x3qr3 +EDAC,pen-cloned-v1,tlab/CORL/runs/4nsovy9n +ReBRAC,relocate-expert-v1,tlab/CORL/runs/vn4ez49k 
+ReBRAC,relocate-expert-v1,tlab/CORL/runs/g8d06ol3 +ReBRAC,relocate-expert-v1,tlab/CORL/runs/p8bgmf74 +ReBRAC,relocate-expert-v1,tlab/CORL/runs/mnzuyed7 +ReBRAC,relocate-cloned-v1,tlab/CORL/runs/ti7m9b2n +ReBRAC,relocate-cloned-v1,tlab/CORL/runs/rv4tgl66 +ReBRAC,relocate-cloned-v1,tlab/CORL/runs/icge7fyw +ReBRAC,relocate-cloned-v1,tlab/CORL/runs/rficipdh +ReBRAC,relocate-human-v1,tlab/CORL/runs/54czmjif +ReBRAC,relocate-human-v1,tlab/CORL/runs/o3p3xuav +ReBRAC,relocate-human-v1,tlab/CORL/runs/s0ausun4 +ReBRAC,relocate-human-v1,tlab/CORL/runs/7kmnp2j8 +ReBRAC,hammer-expert-v1,tlab/CORL/runs/ikzl1j8v +ReBRAC,hammer-expert-v1,tlab/CORL/runs/25qxgcib +ReBRAC,hammer-expert-v1,tlab/CORL/runs/41l65wfu +ReBRAC,hammer-expert-v1,tlab/CORL/runs/ath9ht3x +ReBRAC,hammer-cloned-v1,tlab/CORL/runs/faagln3i +ReBRAC,hammer-cloned-v1,tlab/CORL/runs/k8m3gmuz +ReBRAC,hammer-cloned-v1,tlab/CORL/runs/oe4w3veb +ReBRAC,hammer-cloned-v1,tlab/CORL/runs/tk5ahs8c +ReBRAC,hammer-human-v1,tlab/CORL/runs/vhg9z4bc +ReBRAC,hammer-human-v1,tlab/CORL/runs/n93ypheg +ReBRAC,hammer-human-v1,tlab/CORL/runs/rbi293qx +ReBRAC,hammer-human-v1,tlab/CORL/runs/mitrmfn5 +ReBRAC,door-expert-v1,tlab/CORL/runs/6n9ky2jo +ReBRAC,door-expert-v1,tlab/CORL/runs/opkr8hsy +ReBRAC,door-expert-v1,tlab/CORL/runs/yh40zrig +ReBRAC,door-expert-v1,tlab/CORL/runs/ggzfd1go +ReBRAC,door-cloned-v1,tlab/CORL/runs/zqip5ab6 +ReBRAC,door-cloned-v1,tlab/CORL/runs/06gezcrb +ReBRAC,door-cloned-v1,tlab/CORL/runs/7ws1hy8l +ReBRAC,door-cloned-v1,tlab/CORL/runs/4gu3sufn +ReBRAC,door-human-v1,tlab/CORL/runs/shj6pgbm +ReBRAC,door-human-v1,tlab/CORL/runs/kbnipbeq +ReBRAC,door-human-v1,tlab/CORL/runs/mg4nm6vv +ReBRAC,door-human-v1,tlab/CORL/runs/ay1tuti1 +ReBRAC,pen-expert-v1,tlab/CORL/runs/selo914t +ReBRAC,pen-expert-v1,tlab/CORL/runs/arktwhi2 +ReBRAC,pen-expert-v1,tlab/CORL/runs/noubow6t +ReBRAC,pen-expert-v1,tlab/CORL/runs/9rx110fb +ReBRAC,pen-cloned-v1,tlab/CORL/runs/tf4o48qz +ReBRAC,pen-cloned-v1,tlab/CORL/runs/zqcg65jh +ReBRAC,pen-cloned-v1,tlab/CORL/runs/stjo4d23 +ReBRAC,pen-cloned-v1,tlab/CORL/runs/91j3or4h +ReBRAC,pen-human-v1,tlab/CORL/runs/mxviaea0 +ReBRAC,pen-human-v1,tlab/CORL/runs/ubsx6dvm +ReBRAC,pen-human-v1,tlab/CORL/runs/qmjfujym +ReBRAC,pen-human-v1,tlab/CORL/runs/a1u3jkgr +ReBRAC,maze2d-medium-v1,tlab/CORL/runs/pccgoyjx +ReBRAC,maze2d-medium-v1,tlab/CORL/runs/za0w853s +ReBRAC,maze2d-medium-v1,tlab/CORL/runs/mk05xlnb +ReBRAC,maze2d-medium-v1,tlab/CORL/runs/n0gpl32v +ReBRAC,maze2d-umaze-v1,tlab/CORL/runs/42pvz9q0 +ReBRAC,maze2d-umaze-v1,tlab/CORL/runs/azgdfdti +ReBRAC,maze2d-umaze-v1,tlab/CORL/runs/zk8kc0fn +ReBRAC,maze2d-umaze-v1,tlab/CORL/runs/cvarfx7c +ReBRAC,maze2d-large-v1,tlab/CORL/runs/3rc0eydd +ReBRAC,maze2d-large-v1,tlab/CORL/runs/l0uradyk +ReBRAC,maze2d-large-v1,tlab/CORL/runs/js0dsvf3 +ReBRAC,maze2d-large-v1,tlab/CORL/runs/4d482gx0 +ReBRAC,walker2d-medium-expert-v2,tlab/CORL/runs/9rliqoxs +ReBRAC,walker2d-medium-expert-v2,tlab/CORL/runs/7a7f6oin +ReBRAC,walker2d-medium-expert-v2,tlab/CORL/runs/exe9s8k9 +ReBRAC,walker2d-medium-expert-v2,tlab/CORL/runs/b0wldnir +ReBRAC,walker2d-medium-replay-v2,tlab/CORL/runs/hs9uy9jb +ReBRAC,walker2d-medium-replay-v2,tlab/CORL/runs/iu7cgo06 +ReBRAC,walker2d-medium-replay-v2,tlab/CORL/runs/mzwmck23 +ReBRAC,walker2d-medium-replay-v2,tlab/CORL/runs/ibu99k1i +ReBRAC,walker2d-medium-v2,tlab/CORL/runs/d5uv3tn2 +ReBRAC,walker2d-medium-v2,tlab/CORL/runs/i9u7dhjg +ReBRAC,walker2d-medium-v2,tlab/CORL/runs/4c8rq859 +ReBRAC,walker2d-medium-v2,tlab/CORL/runs/z8orj8mi +ReBRAC,hopper-medium-expert-v2,tlab/CORL/runs/ccz09vyg 
+ReBRAC,hopper-medium-expert-v2,tlab/CORL/runs/kkaqq4hk +ReBRAC,hopper-medium-expert-v2,tlab/CORL/runs/qm5gygoh +ReBRAC,hopper-medium-replay-v2,tlab/CORL/runs/mim76dgi +ReBRAC,hopper-medium-expert-v2,tlab/CORL/runs/sej060m0 +ReBRAC,hopper-medium-replay-v2,tlab/CORL/runs/aq1cbbug +ReBRAC,hopper-medium-replay-v2,tlab/CORL/runs/l2rkiqra +ReBRAC,hopper-medium-replay-v2,tlab/CORL/runs/bhlcsdxg +ReBRAC,hopper-medium-v2,tlab/CORL/runs/g4u9as4g +ReBRAC,hopper-medium-v2,tlab/CORL/runs/cogt9yoq +ReBRAC,hopper-medium-v2,tlab/CORL/runs/gtzsdrtj +ReBRAC,hopper-medium-v2,tlab/CORL/runs/qd0wsplq +ReBRAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/08slc60g +ReBRAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/lhkhx4da +ReBRAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/340db78d +ReBRAC,antmaze-umaze-diverse-v2,tlab/CORL/runs/g89hn72w +ReBRAC,antmaze-medium-diverse-v2,tlab/CORL/runs/9mo8mzm4 +ReBRAC,antmaze-medium-diverse-v2,tlab/CORL/runs/smoz0poy +ReBRAC,antmaze-medium-diverse-v2,tlab/CORL/runs/mihj7jfo +ReBRAC,antmaze-medium-diverse-v2,tlab/CORL/runs/yhtxvmcl +ReBRAC,antmaze-large-diverse-v2,tlab/CORL/runs/ptkuaufn +ReBRAC,antmaze-large-diverse-v2,tlab/CORL/runs/i3gjr802 +ReBRAC,antmaze-large-diverse-v2,tlab/CORL/runs/r0oclfdb +ReBRAC,antmaze-large-diverse-v2,tlab/CORL/runs/9k0czvd7 +ReBRAC,antmaze-umaze-v2,tlab/CORL/runs/pb9nlk3v +ReBRAC,antmaze-umaze-v2,tlab/CORL/runs/mtmcpa7s +ReBRAC,antmaze-umaze-v2,tlab/CORL/runs/xngmmtcn +ReBRAC,antmaze-umaze-v2,tlab/CORL/runs/9qfnrlz9 +ReBRAC,antmaze-medium-play-v2,tlab/CORL/runs/66kotsxy +ReBRAC,antmaze-medium-play-v2,tlab/CORL/runs/djv9iy8j +ReBRAC,antmaze-large-play-v2,tlab/CORL/runs/4b9v36xr +ReBRAC,antmaze-large-play-v2,tlab/CORL/runs/bd41eemk +ReBRAC,antmaze-medium-play-v2,tlab/CORL/runs/cy4r6yhr +ReBRAC,antmaze-large-play-v2,tlab/CORL/runs/tos7myn4 +ReBRAC,antmaze-medium-play-v2,tlab/CORL/runs/yvqqvk6q +ReBRAC,antmaze-large-play-v2,tlab/CORL/runs/i2o4ia5p +ReBRAC,halfcheetah-medium-expert-v2,tlab/CORL/runs/nurhnvul +ReBRAC,halfcheetah-medium-expert-v2,tlab/CORL/runs/0u9knf8z +ReBRAC,halfcheetah-medium-expert-v2,tlab/CORL/runs/mp6dxzyr +ReBRAC,halfcheetah-medium-expert-v2,tlab/CORL/runs/9g3xyww9 +ReBRAC,halfcheetah-medium-v2,tlab/CORL/runs/g5jhlfig +ReBRAC,halfcheetah-medium-v2,tlab/CORL/runs/pvx7v4gl +ReBRAC,halfcheetah-medium-v2,tlab/CORL/runs/8s8icstj +ReBRAC,halfcheetah-medium-v2,tlab/CORL/runs/fva3s355 +ReBRAC,halfcheetah-medium-replay-v2,tlab/CORL/runs/jpysqyzy +ReBRAC,halfcheetah-medium-replay-v2,tlab/CORL/runs/3e0d74rq +ReBRAC,halfcheetah-medium-replay-v2,tlab/CORL/runs/e95mvbq9 +ReBRAC,halfcheetah-medium-replay-v2,tlab/CORL/runs/yht4bujk +BC,maze2d-large-v1,tlab/CORL/runs/gae6mjr6 +BC,maze2d-large-v1,tlab/CORL/runs/3dda9gfw +BC,maze2d-large-v1,tlab/CORL/runs/3sgbj9n0 +BC,maze2d-large-v1,tlab/CORL/runs/67eno4ma +BC,maze2d-medium-v1,tlab/CORL/runs/3bur5hke +BC,maze2d-medium-v1,tlab/CORL/runs/330z0l2v +BC,maze2d-medium-v1,tlab/CORL/runs/1i05t3vj +BC,maze2d-medium-v1,tlab/CORL/runs/k9yfle3x +BC,maze2d-umaze-v1,tlab/CORL/runs/1zreo8zw +BC,maze2d-umaze-v1,tlab/CORL/runs/18vbgvb2 +BC,maze2d-umaze-v1,tlab/CORL/runs/ky3vncuf +BC,maze2d-umaze-v1,tlab/CORL/runs/3tz0z6nh +BC,halfcheetah-medium-v2,tlab/CORL/runs/31dmbfoz +BC,halfcheetah-medium-v2,tlab/CORL/runs/1rhop7f6 +BC,halfcheetah-medium-v2,tlab/CORL/runs/2q070txr +BC,halfcheetah-medium-v2,tlab/CORL/runs/sbcrq218 +BC,halfcheetah-medium-replay-v2,tlab/CORL/runs/28iujcoa +BC,halfcheetah-medium-replay-v2,tlab/CORL/runs/2f12hcq3 +BC,halfcheetah-medium-replay-v2,tlab/CORL/runs/1ptuak40 
+BC,halfcheetah-medium-replay-v2,tlab/CORL/runs/36y8187b +BC,halfcheetah-medium-expert-v2,tlab/CORL/runs/3bn0h2zy +BC,halfcheetah-medium-expert-v2,tlab/CORL/runs/3joz13bc +BC,halfcheetah-medium-expert-v2,tlab/CORL/runs/3s9l1a83 +BC,halfcheetah-medium-expert-v2,tlab/CORL/runs/1q966noh +BC,hopper-medium-v2,tlab/CORL/runs/2b85pbgd +BC,hopper-medium-v2,tlab/CORL/runs/ca0nxbh4 +BC,hopper-medium-v2,tlab/CORL/runs/1ipey1bk +BC,hopper-medium-v2,tlab/CORL/runs/x35k6x12 +BC,hopper-medium-replay-v2,tlab/CORL/runs/1owdjob7 +BC,hopper-medium-replay-v2,tlab/CORL/runs/xoosoz9n +BC,hopper-medium-replay-v2,tlab/CORL/runs/3r09yx27 +BC,hopper-medium-replay-v2,tlab/CORL/runs/3k5v2mso +BC,hopper-medium-expert-v2,tlab/CORL/runs/39tqleqs +BC,hopper-medium-expert-v2,tlab/CORL/runs/9cddvu7a +BC,hopper-medium-expert-v2,tlab/CORL/runs/17v5isiw +BC,hopper-medium-expert-v2,tlab/CORL/runs/2a8wzq2t +BC,walker2d-medium-v2,tlab/CORL/runs/1tgqpiks +BC,walker2d-medium-v2,tlab/CORL/runs/19yfj5xu +BC,walker2d-medium-v2,tlab/CORL/runs/2bneh6uw +BC,walker2d-medium-v2,tlab/CORL/runs/3twop214 +BC,walker2d-medium-replay-v2,tlab/CORL/runs/rhkaisgq +BC,walker2d-medium-replay-v2,tlab/CORL/runs/287bzpdd +BC,walker2d-medium-replay-v2,tlab/CORL/runs/l2gfzbhg +BC,walker2d-medium-replay-v2,tlab/CORL/runs/3gnugxzy +BC,walker2d-medium-expert-v2,tlab/CORL/runs/2uwtj2md +BC,walker2d-medium-expert-v2,tlab/CORL/runs/60yn1nfx +BC,walker2d-medium-expert-v2,tlab/CORL/runs/2p0w55iq +BC,walker2d-medium-expert-v2,tlab/CORL/runs/2rv6pvln +10% BC,maze2d-large-v1,tlab/CORL/runs/84b74c6e-bc52-4083-a601-6a387726c61d +10% BC,maze2d-large-v1,tlab/CORL/runs/e22c302b-e387-4d12-a498-db1c7b787306 +10% BC,maze2d-large-v1,tlab/CORL/runs/c76a5b7c-f459-498e-9aa9-6c0366ded313 +10% BC,maze2d-large-v1,tlab/CORL/runs/dafaa4dc-9359-4feb-be9b-39c3dcadcdd4 +10% BC,maze2d-medium-v1,tlab/CORL/runs/7aff87ac-17e1-49a8-b52d-a210c9be9eee +10% BC,maze2d-medium-v1,tlab/CORL/runs/d14de446-beea-413f-ad5e-c90dfd0e790c +10% BC,maze2d-medium-v1,tlab/CORL/runs/d4713f18-520a-459e-80a6-0acd70d0710f +10% BC,maze2d-medium-v1,tlab/CORL/runs/dfbcb740-26ca-4bbf-9065-ad3ecd60c261 +10% BC,maze2d-umaze-v1,tlab/CORL/runs/134273d4-5eb7-4e42-a62b-b3a387a7a2a4 +10% BC,maze2d-umaze-v1,tlab/CORL/runs/f6b33b84-b8c4-42a9-aae4-0d12db4f8b92 +10% BC,maze2d-umaze-v1,tlab/CORL/runs/c8dff5d6-4b22-4e7f-a3b1-5913ae9b0aed +10% BC,maze2d-umaze-v1,tlab/CORL/runs/6d454981-bf52-4126-b4bc-436e566b76be +10% BC,halfcheetah-medium-v2,tlab/CORL/runs/5d7df542-1567-462f-8885-8c8a0e8a5d19 +10% BC,halfcheetah-medium-v2,tlab/CORL/runs/d1d0f883-1b1d-4429-8c3c-02de6c989cdb +10% BC,halfcheetah-medium-v2,tlab/CORL/runs/8ccf19da-a0e6-4267-a53a-276349aea3be +10% BC,halfcheetah-medium-v2,tlab/CORL/runs/2c0ea1a2-614b-414a-b6fc-baa9663891da +10% BC,halfcheetah-medium-replay-v2,tlab/CORL/runs/3cc3a7f7-8ff0-497c-a6e0-e6c5c5ca9688 +10% BC,halfcheetah-medium-replay-v2,tlab/CORL/runs/c0de3f56-a236-44a4-a532-04064af81b18 +10% BC,halfcheetah-medium-replay-v2,tlab/CORL/runs/f2f1507a-9066-4df1-962e-a3d9bed3015a +10% BC,halfcheetah-medium-replay-v2,tlab/CORL/runs/6313d5cf-9158-4585-9f48-cccbe1ff16f1 +10% BC,halfcheetah-medium-expert-v2,tlab/CORL/runs/ba6e7a6d-2548-4d8a-a35f-286782c3658e +10% BC,halfcheetah-medium-expert-v2,tlab/CORL/runs/ab521663-97d4-4b00-a992-b602d495f7d7 +10% BC,halfcheetah-medium-expert-v2,tlab/CORL/runs/f6c1e15a-23d4-472d-846f-e766a835d67b +10% BC,halfcheetah-medium-expert-v2,tlab/CORL/runs/482908a6-eb2e-4b3d-8254-0ef0124f488e +10% BC,hopper-medium-v2,tlab/CORL/runs/7fc8e114-0c73-4c47-977a-7f8d337dac1f +10% 
BC,hopper-medium-v2,tlab/CORL/runs/c2e4d867-a355-4030-b23f-e9845da0c4bf +10% BC,hopper-medium-v2,tlab/CORL/runs/cec9a1e2-a270-4270-861b-88535dcd4103 +10% BC,hopper-medium-v2,tlab/CORL/runs/dcc5696c-bc69-41a3-a4f7-2865a16651ef +10% BC,hopper-medium-replay-v2,tlab/CORL/runs/b86f27c4-05d0-43d8-b95e-b81edeb45144 +10% BC,hopper-medium-replay-v2,tlab/CORL/runs/364433ae-2974-48c7-a8e5-fc7606dbc819 +10% BC,hopper-medium-replay-v2,tlab/CORL/runs/ba1ae355-2945-4c82-a7be-49e421b59574 +10% BC,hopper-medium-replay-v2,tlab/CORL/runs/c9b94c6c-8a73-4259-848b-61f7b9386309 +10% BC,hopper-medium-expert-v2,tlab/CORL/runs/323e3a40-e919-4dd6-9d97-3e6f7a01b118 +10% BC,hopper-medium-expert-v2,tlab/CORL/runs/6065ffc6-8cee-45d8-b2e5-a600922a89cc +10% BC,hopper-medium-expert-v2,tlab/CORL/runs/b418e6f1-1fcc-43dc-b5e3-475c17d3da1a +10% BC,hopper-medium-expert-v2,tlab/CORL/runs/9b7add9a-d916-4ac8-9538-09d82ea6a7c4 +10% BC,walker2d-medium-v2,tlab/CORL/runs/0155fffe-76ae-4580-ba4a-c90d8c83c8d6 +10% BC,walker2d-medium-v2,tlab/CORL/runs/e7ea6fec-ac94-483f-af5a-c20790569efd +10% BC,walker2d-medium-v2,tlab/CORL/runs/af373d51-823c-4ebc-b863-3ffefb6ad5f0 +10% BC,walker2d-medium-v2,tlab/CORL/runs/82e587c5-afc5-47f3-b71c-734472174a19 +10% BC,walker2d-medium-replay-v2,tlab/CORL/runs/1bca103d-fa9b-405f-a4c3-f4f5aee161c1 +10% BC,walker2d-medium-replay-v2,tlab/CORL/runs/706ea73c-c148-4f2f-96c6-347e600ae566 +10% BC,walker2d-medium-replay-v2,tlab/CORL/runs/e51f8235-0ea3-4eb5-a2ff-67d159404783 +10% BC,walker2d-medium-replay-v2,tlab/CORL/runs/5cd02078-1a5b-4721-9070-c8a5d7bce477 +10% BC,walker2d-medium-expert-v2,tlab/CORL/runs/40eaf786-7305-46a0-8b4c-2dc608c9cf34 +10% BC,walker2d-medium-expert-v2,tlab/CORL/runs/4bceaa03-d8e6-4ec5-b417-d1007f4a7504 +10% BC,walker2d-medium-expert-v2,tlab/CORL/runs/e1f340a7-f659-4143-8c76-22d341532e9c +10% BC,walker2d-medium-expert-v2,tlab/CORL/runs/df22f73b-3904-4d3d-be82-8565a94f90a9 +TD3+BC,maze2d-large-v1,tlab/CORL/runs/3gmwuspv +TD3+BC,maze2d-large-v1,tlab/CORL/runs/hfnz06jo +TD3+BC,maze2d-large-v1,tlab/CORL/runs/22zd4qy5 +TD3+BC,maze2d-large-v1,tlab/CORL/runs/2je1ydbq +TD3+BC,maze2d-medium-v1,tlab/CORL/runs/2cn5kybz +TD3+BC,maze2d-medium-v1,tlab/CORL/runs/4wfevsn1 +TD3+BC,maze2d-medium-v1,tlab/CORL/runs/8uc5g9vl +TD3+BC,maze2d-medium-v1,tlab/CORL/runs/3q3i7kr4 +TD3+BC,maze2d-umaze-v1,tlab/CORL/runs/1383sspe +TD3+BC,maze2d-umaze-v1,tlab/CORL/runs/ujqk6bcx +TD3+BC,maze2d-umaze-v1,tlab/CORL/runs/2har775v +TD3+BC,maze2d-umaze-v1,tlab/CORL/runs/1t9zpxwq +TD3+BC,halfcheetah-medium-v2,tlab/CORL/runs/1manw8ou +TD3+BC,halfcheetah-medium-v2,tlab/CORL/runs/glmwyvtm +TD3+BC,halfcheetah-medium-v2,tlab/CORL/runs/99lixj21 +TD3+BC,halfcheetah-medium-v2,tlab/CORL/runs/21qd6jdk +TD3+BC,halfcheetah-medium-replay-v2,tlab/CORL/runs/13i7gvdv +TD3+BC,halfcheetah-medium-replay-v2,tlab/CORL/runs/lfnzn3ek +TD3+BC,halfcheetah-medium-replay-v2,tlab/CORL/runs/2iqxrf7v +TD3+BC,halfcheetah-medium-replay-v2,tlab/CORL/runs/28q8k0is +TD3+BC,halfcheetah-medium-expert-v2,tlab/CORL/runs/2klwm3m9 +TD3+BC,halfcheetah-medium-expert-v2,tlab/CORL/runs/vgj8gxc9 +TD3+BC,halfcheetah-medium-expert-v2,tlab/CORL/runs/1zpikd1i +TD3+BC,halfcheetah-medium-expert-v2,tlab/CORL/runs/3mhuu91m +TD3+BC,hopper-medium-v2,tlab/CORL/runs/o9cy1xot +TD3+BC,hopper-medium-v2,tlab/CORL/runs/9oorg18b +TD3+BC,hopper-medium-v2,tlab/CORL/runs/8umnr31k +TD3+BC,hopper-medium-v2,tlab/CORL/runs/8ay8wua0 +TD3+BC,hopper-medium-replay-v2,tlab/CORL/runs/36r6bciu +TD3+BC,hopper-medium-replay-v2,tlab/CORL/runs/3dhx3yws +TD3+BC,hopper-medium-replay-v2,tlab/CORL/runs/2xgt4p29 
+TD3+BC,hopper-medium-replay-v2,tlab/CORL/runs/2i8f6fsw +TD3+BC,hopper-medium-expert-v2,tlab/CORL/runs/1pocua7w +TD3+BC,hopper-medium-expert-v2,tlab/CORL/runs/3apac4jp +TD3+BC,hopper-medium-expert-v2,tlab/CORL/runs/3axkszn9 +TD3+BC,hopper-medium-expert-v2,tlab/CORL/runs/iyy3p627 +TD3+BC,walker2d-medium-v2,tlab/CORL/runs/2evz37in +TD3+BC,walker2d-medium-v2,tlab/CORL/runs/rcuf9ji6 +TD3+BC,walker2d-medium-v2,tlab/CORL/runs/2nguxmuw +TD3+BC,walker2d-medium-v2,tlab/CORL/runs/563x3nqx +TD3+BC,walker2d-medium-replay-v2,tlab/CORL/runs/3pp38z95 +TD3+BC,walker2d-medium-replay-v2,tlab/CORL/runs/c7htx54f +TD3+BC,walker2d-medium-replay-v2,tlab/CORL/runs/35i1e9k3 +TD3+BC,walker2d-medium-replay-v2,tlab/CORL/runs/34kpercv +TD3+BC,walker2d-medium-expert-v2,tlab/CORL/runs/1y6a1ghl +TD3+BC,walker2d-medium-expert-v2,tlab/CORL/runs/1r5ja7w3 +TD3+BC,walker2d-medium-expert-v2,tlab/CORL/runs/2ksjowc8 +TD3+BC,walker2d-medium-expert-v2,tlab/CORL/runs/1v789w9r +DT,maze2d-large-v1,tlab/CORL/runs/f5447eae-38f5-404e-ab97-979d12a62dba +DT,maze2d-large-v1,tlab/CORL/runs/063ec049-6092-46fd-8d06-5c43aa0c8933 +DT,maze2d-large-v1,tlab/CORL/runs/517996bc-48dd-4cc5-a1a2-b599668dfb03 +DT,maze2d-large-v1,tlab/CORL/runs/cdb110c8-baed-4b72-9338-e2df069c1999 +DT,maze2d-medium-v1,tlab/CORL/runs/863ba3ad-2e15-4027-a561-50a1ce837a2e +DT,maze2d-medium-v1,tlab/CORL/runs/a120a194-2a4d-493f-a105-29e81c2167f3 +DT,maze2d-medium-v1,tlab/CORL/runs/db99a51a-20ec-4898-b432-7bed581b11eb +DT,maze2d-medium-v1,tlab/CORL/runs/ef619bf1-e43f-4ca0-b26a-e44a79c8d6c4 +DT,maze2d-umaze-v1,tlab/CORL/runs/d61f15f2-bb63-4b0e-8a3f-0a8397f85c99 +DT,maze2d-umaze-v1,tlab/CORL/runs/bc356f6c-ff8a-4fcb-8f7d-eda711bf187f +DT,maze2d-umaze-v1,tlab/CORL/runs/e55c1f59-4a22-4adf-90db-55b761184c31 +DT,maze2d-umaze-v1,tlab/CORL/runs/754eb9df-300b-4816-b483-1ecc8630d170 +DT,halfcheetah-medium-v2,tlab/CORL/runs/fcdf10b7-3f06-4950-89e5-0bb706d32fa2 +DT,halfcheetah-medium-v2,tlab/CORL/runs/3149b249-61b7-42b5-b62c-560263073ceb +DT,halfcheetah-medium-v2,tlab/CORL/runs/e3f4068c-2f7a-4d98-8bfe-71e5bcd37f60 +DT,halfcheetah-medium-v2,tlab/CORL/runs/07bafbb5-cef0-487f-9d18-43f5e6f41e5b +DT,halfcheetah-medium-replay-v2,tlab/CORL/runs/bdc16cb0-7ba1-44e5-a634-f7821849e911 +DT,halfcheetah-medium-replay-v2,tlab/CORL/runs/1c63037a-0f9e-4c92-8e30-f868e5899235 +DT,halfcheetah-medium-replay-v2,tlab/CORL/runs/49ccdf3d-49f8-43f7-ae5e-5f2166928b08 +DT,halfcheetah-medium-replay-v2,tlab/CORL/runs/86e2bdf2-bfc8-4dd8-b245-06f3c5948525 +DT,halfcheetah-medium-expert-v2,tlab/CORL/runs/b7865c5a-6382-4dfe-967d-f5f41caef859 +DT,halfcheetah-medium-expert-v2,tlab/CORL/runs/1a9ae20a-0ef3-4517-aa21-0114606e8e44 +DT,halfcheetah-medium-expert-v2,tlab/CORL/runs/68993e5b-f477-496e-ab8c-da7808851e31 +DT,halfcheetah-medium-expert-v2,tlab/CORL/runs/d9682650-69b2-4cce-832c-a0a5d63d7b87 +DT,hopper-medium-v2,tlab/CORL/runs/51b5a164-e6ab-4929-bf76-b786a3e40654 +DT,hopper-medium-v2,tlab/CORL/runs/abd10b19-e2c5-4e27-99ac-2ca8445acd51 +DT,hopper-medium-v2,tlab/CORL/runs/5c0c2cb0-2457-40dc-905b-8bf32b8a75fe +DT,hopper-medium-v2,tlab/CORL/runs/98977940-fab9-462c-ac70-3fcd10bc55cb +DT,hopper-medium-replay-v2,tlab/CORL/runs/a513ea52-a879-47a6-ab4c-ac1a046b5cc2 +DT,hopper-medium-replay-v2,tlab/CORL/runs/0cffd41b-d983-4b45-93c8-2e22fc5801c0 +DT,hopper-medium-replay-v2,tlab/CORL/runs/c7b8a1c8-170f-4060-860c-62553ff67911 +DT,hopper-medium-replay-v2,tlab/CORL/runs/7df0497b-d805-47ce-91ba-485d7bff6fb6 +DT,hopper-medium-expert-v2,tlab/CORL/runs/3db49470-beba-49f8-963b-bc7fbe79d107 
+DT,hopper-medium-expert-v2,tlab/CORL/runs/21fea44e-168d-4356-a72c-1ac09a482d05 +DT,hopper-medium-expert-v2,tlab/CORL/runs/60a8e98b-5933-491e-83c7-f48b777fb52e +DT,hopper-medium-expert-v2,tlab/CORL/runs/7eaf035d-9394-4eee-97f0-50347b108b6a +DT,walker2d-medium-v2,tlab/CORL/runs/76b97aeb-4327-4fb1-bbd4-572f84b9ac6c +DT,walker2d-medium-v2,tlab/CORL/runs/2eaf20df-c7d2-42c7-9d6f-5f29e240b99f +DT,walker2d-medium-v2,tlab/CORL/runs/fa033830-cec7-4144-894d-741391fdb81d +DT,walker2d-medium-v2,tlab/CORL/runs/04917eeb-b7a5-4e02-9e89-7eed774cd00b +DT,walker2d-medium-replay-v2,tlab/CORL/runs/d296d6ef-8a37-4c39-be14-ab54eb85a0ee +DT,walker2d-medium-replay-v2,tlab/CORL/runs/825a83d5-0ed4-4c97-9c79-13edfa43e6cc +DT,walker2d-medium-replay-v2,tlab/CORL/runs/277df654-7035-4469-8150-ff3df3f6230e +DT,walker2d-medium-replay-v2,tlab/CORL/runs/6428588e-c9bc-43ba-a945-285248e0664b +DT,walker2d-medium-expert-v2,tlab/CORL/runs/0d1ae046-abcb-4da1-b2d3-1360bbd8f54f +DT,walker2d-medium-expert-v2,tlab/CORL/runs/9eb231d9-6c25-4d42-9564-90164b7e680b +DT,walker2d-medium-expert-v2,tlab/CORL/runs/f4c212ba-7b8e-428e-9953-71606fd84d67 +DT,walker2d-medium-expert-v2,tlab/CORL/runs/3bc164b8-1fc0-4ce5-a32d-701e522ad5b1 +SAC-N,maze2d-large-v1,tlab/CORL/runs/a7e3d2a0-2dbc-4eba-b28d-8315f992bae3 +SAC-N,maze2d-large-v1,tlab/CORL/runs/65981364-10fc-47d3-bb35-ccc67254ca23 +SAC-N,maze2d-large-v1,tlab/CORL/runs/ceb4bd07-50d4-426c-9e2b-a54fc4a1092a +SAC-N,maze2d-large-v1,tlab/CORL/runs/a2fe5d76-b680-42b1-aafa-4f7fae8e9575 +SAC-N,maze2d-medium-v1,tlab/CORL/runs/342b9c5e-eb78-45b1-99fc-97654d2d619a +SAC-N,maze2d-medium-v1,tlab/CORL/runs/eaab4d73-b002-4587-89e9-b101efc5c385 +SAC-N,maze2d-medium-v1,tlab/CORL/runs/f83b4b8c-bddd-469a-acf5-c2c59b80fd3c +SAC-N,maze2d-medium-v1,tlab/CORL/runs/4c2065f4-e773-4760-a045-18958aff4685 +SAC-N,maze2d-umaze-v1,tlab/CORL/runs/eef336bc-42f0-46bc-90df-17d6b5647263 +SAC-N,maze2d-umaze-v1,tlab/CORL/runs/16b37de3-9011-4a20-b58a-d1d97946125a +SAC-N,maze2d-umaze-v1,tlab/CORL/runs/81bdccf5-1ce7-4ab5-9228-1193209b9f85 +SAC-N,maze2d-umaze-v1,tlab/CORL/runs/700bc2bd-3ae8-4845-a5a7-ea9ce5a5bf68 +SAC-N,halfcheetah-medium-v2,tlab/CORL/runs/c0015d64-2bce-4bf7-a804-92390d022ec9 +SAC-N,halfcheetah-medium-v2,tlab/CORL/runs/f7a045fb-89de-4df1-a827-0b0aff6fa803 +SAC-N,halfcheetah-medium-v2,tlab/CORL/runs/c61cc412-51fa-41ef-be06-5e8eaba5272e +SAC-N,halfcheetah-medium-v2,tlab/CORL/runs/e08593b0-edc7-49a7-bf68-e66e613ed20f +SAC-N,halfcheetah-medium-replay-v2,tlab/CORL/runs/3be8a859-82e5-4cc2-899d-4ff7f88a90ed +SAC-N,halfcheetah-medium-replay-v2,tlab/CORL/runs/c5dd3800-eed4-4711-8172-0d22bc985ed9 +SAC-N,halfcheetah-medium-replay-v2,tlab/CORL/runs/ff761882-9f47-4f3b-8cf9-0f5cf0b40339 +SAC-N,halfcheetah-medium-replay-v2,tlab/CORL/runs/0257eae7-716d-4c68-b8a2-1d99c74d79d0 +SAC-N,halfcheetah-medium-expert-v2,tlab/CORL/runs/8c18b80d-028d-48dd-a371-b2fab308469a +SAC-N,halfcheetah-medium-expert-v2,tlab/CORL/runs/c86ba1cc-8b4c-4dd8-b64d-8f57a8131d95 +SAC-N,halfcheetah-medium-expert-v2,tlab/CORL/runs/bc5fda0c-2f5c-4391-8bd5-c4f2e15c2e0c +SAC-N,halfcheetah-medium-expert-v2,tlab/CORL/runs/c3fdffef-f3cb-4d18-9d94-af4e0651ba21 +SAC-N,hopper-medium-v2,tlab/CORL/runs/95c7d8e0-f634-403a-8edb-ea00afd5c69c +SAC-N,hopper-medium-v2,tlab/CORL/runs/4580d97f-15b0-4d54-887c-91cf0a3368ea +SAC-N,hopper-medium-v2,tlab/CORL/runs/ad47291b-1469-48b5-ba20-266a05bc9326 +SAC-N,hopper-medium-v2,tlab/CORL/runs/16f77985-8033-4953-8066-c33c49141581 +SAC-N,hopper-medium-replay-v2,tlab/CORL/runs/69bf1797-94b0-43fa-b22c-a6406a93d222 
+SAC-N,hopper-medium-replay-v2,tlab/CORL/runs/dadbb413-ae11-48bb-a4bb-94c8b4c7d53f +SAC-N,hopper-medium-replay-v2,tlab/CORL/runs/c1db8aa9-9bfc-4687-a8b5-6096c90f6e9b +SAC-N,hopper-medium-replay-v2,tlab/CORL/runs/b6ff762e-c0be-4b6d-ac23-8b5ffcb28a56 +SAC-N,hopper-medium-expert-v2,tlab/CORL/runs/ab688db2-ab1d-4d96-ba40-6186c7ecb16b +SAC-N,hopper-medium-expert-v2,tlab/CORL/runs/d0a5c6be-7b64-4ddb-b965-1ae8e0533363 +SAC-N,hopper-medium-expert-v2,tlab/CORL/runs/9f67f421-c55b-4527-8ea0-8e6579a3bb61 +SAC-N,hopper-medium-expert-v2,tlab/CORL/runs/ab44a4d1-6aee-420e-b691-307bd083d2ea +SAC-N,walker2d-medium-v2,tlab/CORL/runs/3394eb73-a8b3-463c-9a57-8dd65833ecdd +SAC-N,walker2d-medium-v2,tlab/CORL/runs/89527361-8f90-47a5-8882-ac3459de0d0a +SAC-N,walker2d-medium-v2,tlab/CORL/runs/f02528e5-86d6-4242-961f-106cb0e5df14 +SAC-N,walker2d-medium-v2,tlab/CORL/runs/132a99bc-386a-4eb4-a64c-74699d0563b5 +SAC-N,walker2d-medium-replay-v2,tlab/CORL/runs/33ce900d-b858-4bc3-a6dc-71f9615cfad5 +SAC-N,walker2d-medium-replay-v2,tlab/CORL/runs/87addd3a-42bd-45b7-8dcb-a921dfa6dad5 +SAC-N,walker2d-medium-replay-v2,tlab/CORL/runs/bcfb639c-1d44-4228-bbd8-e560b48bb5d6 +SAC-N,walker2d-medium-replay-v2,tlab/CORL/runs/249f88e4-c98f-401f-bb36-4d5f239fff74 +SAC-N,walker2d-medium-expert-v2,tlab/CORL/runs/fc7fa907-ab00-457d-a00d-2bdd65688379 +SAC-N,walker2d-medium-expert-v2,tlab/CORL/runs/20f7258d-0f07-4002-86b2-4c3ec65ee067 +SAC-N,walker2d-medium-expert-v2,tlab/CORL/runs/c3e71147-80a2-4ae8-bb59-9b994daaa516 +SAC-N,walker2d-medium-expert-v2,tlab/CORL/runs/e36a72da-482f-4a70-803f-1a0d7eccb265 +EDAC,maze2d-large-v1,tlab/CORL/runs/1m3k2bd1 +EDAC,maze2d-large-v1,tlab/CORL/runs/3jzf46zg +EDAC,maze2d-large-v1,tlab/CORL/runs/exlzrv4v +EDAC,maze2d-large-v1,tlab/CORL/runs/3r2qku3k +EDAC,maze2d-medium-v1,tlab/CORL/runs/3crj1urn +EDAC,maze2d-medium-v1,tlab/CORL/runs/25vxky59 +EDAC,maze2d-medium-v1,tlab/CORL/runs/258aw9fy +EDAC,maze2d-medium-v1,tlab/CORL/runs/3oc7jc1q +EDAC,maze2d-umaze-v1,tlab/CORL/runs/31ak0z9b +EDAC,maze2d-umaze-v1,tlab/CORL/runs/hjl7pxfa +EDAC,maze2d-umaze-v1,tlab/CORL/runs/2qq9dfgc +EDAC,maze2d-umaze-v1,tlab/CORL/runs/c0pdrw6f +EDAC,halfcheetah-medium-v2,tlab/CORL/runs/5d588f87-fe51-4253-b310-a75fbf8d3702 +EDAC,halfcheetah-medium-v2,tlab/CORL/runs/10aa52ac-b2f4-43c4-97f1-4bee57fdab24 +EDAC,halfcheetah-medium-v2,tlab/CORL/runs/3500687d-84c6-4cc6-88a9-ac432fe83f42 +EDAC,halfcheetah-medium-v2,tlab/CORL/runs/2108ebe3-d55d-418a-9fda-f78a8337909a +EDAC,halfcheetah-medium-replay-v2,tlab/CORL/runs/8853c87c-9bdc-411e-8128-f0976c510485 +EDAC,halfcheetah-medium-replay-v2,tlab/CORL/runs/b86adeb5-282b-4f9b-bd4f-361b576c9988 +EDAC,halfcheetah-medium-replay-v2,tlab/CORL/runs/6b675ca0-3fed-498a-ae54-e964673158d4 +EDAC,halfcheetah-medium-replay-v2,tlab/CORL/runs/48813224-53a2-495e-86a2-d72a5b95ba94 +EDAC,halfcheetah-medium-expert-v2,tlab/CORL/runs/996be0e1-ae88-492d-b261-15f034cc6203 +EDAC,halfcheetah-medium-expert-v2,tlab/CORL/runs/62bcf801-db79-438e-b0f4-74436f3c67b1 +EDAC,halfcheetah-medium-expert-v2,tlab/CORL/runs/ffddfea8-2e9b-493b-88df-04a15f97d7a8 +EDAC,halfcheetah-medium-expert-v2,tlab/CORL/runs/b07eb900-8653-4688-a10f-111f3eb3c84a +EDAC,hopper-medium-v2,tlab/CORL/runs/59f743f9-3b3a-4306-83b5-98721508bf2f +EDAC,hopper-medium-v2,tlab/CORL/runs/74a7e942-ca43-44e8-85f7-976fa7dd2edd +EDAC,hopper-medium-v2,tlab/CORL/runs/20425c80-a0f3-4e1a-9991-a85db7012417 +EDAC,hopper-medium-v2,tlab/CORL/runs/6fb1e9e2-9485-40c9-ac77-b118cd9cc55b +EDAC,hopper-medium-replay-v2,tlab/CORL/runs/6145c71a-ce9b-4817-bf94-a6eef9b79377 
+EDAC,hopper-medium-replay-v2,tlab/CORL/runs/c7d59200-7e0f-47a4-846a-123fb23d3c30 +EDAC,hopper-medium-replay-v2,tlab/CORL/runs/00379327-06d9-4117-9abb-0f4fef0d6f38 +EDAC,hopper-medium-replay-v2,tlab/CORL/runs/dc1c3646-d8fd-4671-b43c-b987441f70cf +EDAC,hopper-medium-expert-v2,tlab/CORL/runs/a58fedea-d5fe-4481-bca4-0e44989f049e +EDAC,hopper-medium-expert-v2,tlab/CORL/runs/05dc4e17-4c73-4f71-b5c3-2eb39aae36c8 +EDAC,hopper-medium-expert-v2,tlab/CORL/runs/155aa581-5e1f-4d32-acd5-edde7c5e3c6a +EDAC,hopper-medium-expert-v2,tlab/CORL/runs/5e5e6d1a-59c4-4044-9d50-7d1b920bb626 +EDAC,walker2d-medium-v2,tlab/CORL/runs/ffb22753-338f-4d2a-ba45-aaeba6a5eed3 +EDAC,walker2d-medium-v2,tlab/CORL/runs/6d1e8c3f-bd50-4e02-8adc-bf7db13d15ad +EDAC,walker2d-medium-v2,tlab/CORL/runs/f99181eb-499d-48be-b1e3-5349f8fe3731 +EDAC,walker2d-medium-v2,tlab/CORL/runs/fd8b7f41-48cc-4578-8fc8-55ec5e5884df +EDAC,walker2d-medium-replay-v2,tlab/CORL/runs/a0a92721-04b1-4868-809e-2ce37358516b +EDAC,walker2d-medium-replay-v2,tlab/CORL/runs/c484e9cd-ee4d-427a-941d-80926caa3128 +EDAC,walker2d-medium-replay-v2,tlab/CORL/runs/5790cb46-ea8c-42b6-abe6-a70faa0f4633 +EDAC,walker2d-medium-replay-v2,tlab/CORL/runs/ed665d8c-1bb5-4858-9136-574bf523b39a +EDAC,walker2d-medium-expert-v2,tlab/CORL/runs/1e6e9a77-a335-41e0-9e29-6271f5a4fcda +EDAC,walker2d-medium-expert-v2,tlab/CORL/runs/d6492463-82f1-4512-99fa-b23073d6b418 +EDAC,walker2d-medium-expert-v2,tlab/CORL/runs/96027203-781b-46ee-bf59-e565227f2f7b +EDAC,walker2d-medium-expert-v2,tlab/CORL/runs/d5f5f415-9d1b-4d35-b4e5-c1cf278af46c +AWAC,maze2d-large-v1,tlab/CORL/runs/3me14n0w +AWAC,maze2d-large-v1,tlab/CORL/runs/8671xq2j +AWAC,maze2d-large-v1,tlab/CORL/runs/3keq4k8a +AWAC,maze2d-large-v1,tlab/CORL/runs/3jq85ti0 +AWAC,maze2d-medium-v1,tlab/CORL/runs/1vvutaak +AWAC,maze2d-medium-v1,tlab/CORL/runs/16nzq1ng +AWAC,maze2d-medium-v1,tlab/CORL/runs/3552gil2 +AWAC,maze2d-medium-v1,tlab/CORL/runs/3l3dpq11 +AWAC,maze2d-umaze-v1,tlab/CORL/runs/3usi5cuh +AWAC,maze2d-umaze-v1,tlab/CORL/runs/2vvw9y8h +AWAC,maze2d-umaze-v1,tlab/CORL/runs/2vcog7cq +AWAC,maze2d-umaze-v1,tlab/CORL/runs/qp93j6we +AWAC,halfcheetah-medium-v2,tlab/CORL/runs/1n8ttdck +AWAC,halfcheetah-medium-v2,tlab/CORL/runs/1bpgemq2 +AWAC,halfcheetah-medium-v2,tlab/CORL/runs/39wb3kat +AWAC,halfcheetah-medium-v2,tlab/CORL/runs/w9i9g39x +AWAC,halfcheetah-medium-replay-v2,tlab/CORL/runs/3gfpaz8e +AWAC,halfcheetah-medium-replay-v2,tlab/CORL/runs/3aerk47s +AWAC,halfcheetah-medium-replay-v2,tlab/CORL/runs/275nzj65 +AWAC,halfcheetah-medium-replay-v2,tlab/CORL/runs/2fxchaks +AWAC,halfcheetah-medium-expert-v2,tlab/CORL/runs/220xo7sy +AWAC,halfcheetah-medium-expert-v2,tlab/CORL/runs/186848oq +AWAC,halfcheetah-medium-expert-v2,tlab/CORL/runs/2qcui7s9 +AWAC,halfcheetah-medium-expert-v2,tlab/CORL/runs/3izk7ats +AWAC,hopper-medium-v2,tlab/CORL/runs/3p8nop3c +AWAC,hopper-medium-v2,tlab/CORL/runs/2n4njt2r +AWAC,hopper-medium-v2,tlab/CORL/runs/cfgxmidd +AWAC,hopper-medium-v2,tlab/CORL/runs/o3jqikii +AWAC,hopper-medium-replay-v2,tlab/CORL/runs/1jg2th4m +AWAC,hopper-medium-replay-v2,tlab/CORL/runs/3qqk3v1v +AWAC,hopper-medium-replay-v2,tlab/CORL/runs/1og7e8w1 +AWAC,hopper-medium-replay-v2,tlab/CORL/runs/1hg2vtf9 +AWAC,hopper-medium-expert-v2,tlab/CORL/runs/3b6t3c8p +AWAC,hopper-medium-expert-v2,tlab/CORL/runs/i15nczq4 +AWAC,hopper-medium-expert-v2,tlab/CORL/runs/3v7jt3p7 +AWAC,hopper-medium-expert-v2,tlab/CORL/runs/2uvghydj +AWAC,walker2d-medium-v2,tlab/CORL/runs/3v1rznw2 +AWAC,walker2d-medium-v2,tlab/CORL/runs/2ov8rc9w 
+AWAC,walker2d-medium-v2,tlab/CORL/runs/3funjmu4 +AWAC,walker2d-medium-v2,tlab/CORL/runs/3o823qdi +AWAC,walker2d-medium-replay-v2,tlab/CORL/runs/21coamdv +AWAC,walker2d-medium-replay-v2,tlab/CORL/runs/35cmwtdl +AWAC,walker2d-medium-replay-v2,tlab/CORL/runs/3pvuqbr5 +AWAC,walker2d-medium-replay-v2,tlab/CORL/runs/ic2e00s6 +AWAC,walker2d-medium-expert-v2,tlab/CORL/runs/2utgl834 +AWAC,walker2d-medium-expert-v2,tlab/CORL/runs/3hvawfk9 +AWAC,walker2d-medium-expert-v2,tlab/CORL/runs/3mo9ld3q +AWAC,walker2d-medium-expert-v2,tlab/CORL/runs/1aihv0tw diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 6dc3b40a..00000000 --- a/setup.cfg +++ /dev/null @@ -1,34 +0,0 @@ -[flake8] -ignore = C408,D100,D101,D102,D103,D104,D104,D105,D106,D107,D202,D205,D212,D415,DAR101,DAR201,E731,N806,N812,NIP319,NIP322,NIP323,E203 -max-line-length = 89 -max-doc-length = 89 -inline-quotes = double -multiline-quotes = double -docstring-quotes = double -convention = google -docstring_style = google -strictness = short -per-file-ignores = - **/__init__.py:F401 - -[darglint] -ignore_regex=^_(.*) - -[isort] -combine_as_imports = true -order_by_type = false -force_grid_wrap = 0 -force_sort_within_sections = true -line_length = 89 -lines_between_types = 0 -multi_line_output = 3 -no_lines_before = STDLIB,LOCALFOLDER -reverse_relative = true -default_section = THIRDPARTY -known_first_party = animus -known_src = src -skip_glob = **/__init__.py -force_to_top = typing -include_trailing_comma = true -use_parentheses = true -sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,SRC,LOCALFOLDER \ No newline at end of file
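
The listing above is a plain `algorithm,dataset,run` CSV pointing at the released wandb runs. As a rough illustration of how such a file could be consumed, here is a minimal Python sketch that walks it and pulls each run's summary through the public `wandb` API. The local filename `runs.csv`, the header handling, the mapping of a `tlab/CORL/runs/<id>` entry to the API path `tlab/CORL/<id>`, and the metric key are assumptions for the sketch, not part of this diff.

```python
# Sketch only: read the algorithm,dataset,run listing and query the public wandb API.
import csv

import wandb

api = wandb.Api()

with open("runs.csv") as f:  # hypothetical local copy of the listing above
    for row in csv.reader(f):
        if not row or row[0] == "algorithm":
            continue  # skip blank lines and a header row, if one is present
        algorithm, dataset, run_path = row
        # Assumption: "tlab/CORL/runs/<id>" corresponds to the API path "tlab/CORL/<id>".
        run = api.run(f"tlab/CORL/{run_path.split('/')[-1]}")
        # run.summary holds the last logged values; the exact metric key may differ per algorithm.
        score = run.summary.get("eval/normalized_score_mean", "n/a")
        print(f"{algorithm:10s} {dataset:35s} {score}")
```

A loop like this is only meant to show the shape of the data; for report-style aggregation (means over the ~4 seeds per dataset) one would group the rows by `algorithm` and `dataset` before querying.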