From e6c3f8590693aff521c3015235c14cfe6c3053ec Mon Sep 17 00:00:00 2001 From: Ruiyang Wang <56065503+rynewang@users.noreply.github.com> Date: Wed, 22 May 2024 23:03:19 -0700 Subject: [PATCH 01/65] [core] Deflake test_get_locations on windows. (#45506) Signed-off-by: Ruiyang Wang --- python/ray/tests/test_get_locations.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/ray/tests/test_get_locations.py b/python/ray/tests/test_get_locations.py index 74e1e149fea9..c833f07cbdad 100644 --- a/python/ray/tests/test_get_locations.py +++ b/python/ray/tests/test_get_locations.py @@ -151,8 +151,7 @@ def task(): class BigObject: def __init__(self): - # 100 MiB of memory used... - self.data = np.zeros((100 * 1024 * 1024), dtype=np.uint8) + self.data = np.zeros((BIG_OBJ_SIZE,), dtype=np.uint8) @ray.remote From d091bb5df7b0a8ec10d11ed53f44129f621c0c7d Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 23 May 2024 00:30:57 -0700 Subject: [PATCH 02/65] [Data] Allow user to configure timeout for actor pool (#45508) If your Dataset waits more than 10 minutes to launch GPU actors, your program fails. This PR allows you to configure this behavior so that you can wait longer. Signed-off-by: Balaji Veeramani --- .../execution/operators/actor_pool_map_operator.py | 7 ++----- python/ray/data/context.py | 7 +++++++ python/ray/data/tests/test_actor_pool_map_operator.py | 7 ++----- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py index 64a37d495a05..908a938abe80 100644 --- a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py +++ b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py @@ -28,10 +28,6 @@ # fairly high since streaming backpressure prevents us from overloading actors. 
DEFAULT_MAX_TASKS_IN_FLIGHT = 4 -# The default time to wait for minimum requested actors -# to start before raising a timeout, in seconds. -DEFAULT_WAIT_FOR_MIN_ACTORS_SEC = 60 * 10 - class ActorPoolMapOperator(MapOperator): """A MapOperator implementation that executes tasks on an actor pool. @@ -132,7 +128,8 @@ def start(self, options: ExecutionOptions): # upstream operators, leading to a spike in memory usage prior to steady state. logger.debug(f"{self._name}: Waiting for {len(refs)} pool actors to start...") try: - ray.get(refs, timeout=DEFAULT_WAIT_FOR_MIN_ACTORS_SEC) + timeout = DataContext.get_current().wait_for_min_actors_s + ray.get(refs, timeout=timeout) except ray.exceptions.GetTimeoutError: raise ray.exceptions.GetTimeoutError( "Timed out while starting actors. " diff --git a/python/ray/data/context.py b/python/ray/data/context.py index 6c68595cf024..a550c17eb7cb 100644 --- a/python/ray/data/context.py +++ b/python/ray/data/context.py @@ -128,6 +128,10 @@ # calls if the URI is an S3 URI. DEFAULT_S3_TRY_CREATE_DIR = False +DEFAULT_WAIT_FOR_MIN_ACTORS_S = env_integer( + "RAY_DATA_DEFAULT_WAIT_FOR_MIN_ACTORS_S", 60 * 10 +) + def _execution_options_factory() -> "ExecutionOptions": # Lazily import to avoid circular dependencies. @@ -224,6 +228,8 @@ class DataContext: execution starts. s3_try_create_dir: If ``True``, try to create directories on S3 when a write call is made with a S3 URI. + wait_for_min_actors_s: The default time to wait for minimum requested + actors to start before raising a timeout, in seconds. 
""" target_max_block_size: int = DEFAULT_TARGET_MAX_BLOCK_SIZE @@ -269,6 +275,7 @@ class DataContext: ) print_on_execution_start: bool = True s3_try_create_dir: bool = DEFAULT_S3_TRY_CREATE_DIR + wait_for_min_actors_s: int = DEFAULT_WAIT_FOR_MIN_ACTORS_S def __post_init__(self): # The additonal ray remote args that should be added to diff --git a/python/ray/data/tests/test_actor_pool_map_operator.py b/python/ray/data/tests/test_actor_pool_map_operator.py index 07eebfe0dccb..ee162024d4d9 100644 --- a/python/ray/data/tests/test_actor_pool_map_operator.py +++ b/python/ray/data/tests/test_actor_pool_map_operator.py @@ -515,7 +515,7 @@ def test_locality_manager_busyness_ranking(self): assert res3 is None -def test_start_actor_timeout(ray_start_regular_shared): +def test_start_actor_timeout(ray_start_regular_shared, restore_data_context): """Tests that ActorPoolMapOperator raises an exception on timeout while waiting for actors.""" @@ -523,11 +523,9 @@ class UDFClass: def __call__(self, x): return x - from ray.data._internal.execution.operators import actor_pool_map_operator from ray.exceptions import GetTimeoutError - original_timeout = actor_pool_map_operator.DEFAULT_WAIT_FOR_MIN_ACTORS_SEC - actor_pool_map_operator.DEFAULT_WAIT_FOR_MIN_ACTORS_SEC = 1 + ray.data.DataContext.get_current().wait_for_min_actors_s = 1 with pytest.raises( GetTimeoutError, @@ -544,7 +542,6 @@ def __call__(self, x): compute=ray.data.ActorPoolStrategy(size=5), num_gpus=100, ).take_all() - actor_pool_map_operator.DEFAULT_WAIT_FOR_MIN_ACTORS_SEC = original_timeout if __name__ == "__main__": From 2bd35d76175e22a129566ba1677f08d80eb2b94c Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Thu, 23 May 2024 08:43:27 -0700 Subject: [PATCH 03/65] Revert "[ci][microcheck] include step id from all step job flavors" (#45514) Reverts ray-project/ray#45403. This seems like not a good idea. 
Run between different test flavors do not provide a lot of signals, but there are a lot of those so it is notably more expensive. Test: - https://buildkite.com/ray-project/microcheck/builds/488 Signed-off-by: can --- .buildkite/serve.rayci.yml | 2 +- release/ray_release/test.py | 15 +++------------ release/ray_release/tests/test_test.py | 15 ++++++--------- 3 files changed, 10 insertions(+), 22 deletions(-) diff --git a/.buildkite/serve.rayci.yml b/.buildkite/serve.rayci.yml index bc545ed0c5d2..97c7cd14a421 100644 --- a/.buildkite/serve.rayci.yml +++ b/.buildkite/serve.rayci.yml @@ -57,7 +57,7 @@ steps: depends_on: servepydantic1build - label: ":ray-serve: serve: python {{matrix.python}} tests ({{matrix.worker_id}})" - if: build.pull_request.labels includes "continuous-build" || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" + if: build.pull_request.labels includes "continuous-build" || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa" tags: - serve - python diff --git a/release/ray_release/test.py b/release/ray_release/test.py index 343e02dda7f1..868adfac0cb5 100644 --- a/release/ray_release/test.py +++ b/release/ray_release/test.py @@ -204,19 +204,10 @@ def gen_high_impact_tests(cls, prefix: str) -> Dict[str, List]: ] step_id_to_tests = {} for test in high_impact_tests: - recent_results = test.get_test_results() - if not recent_results: + step_id = test.get_test_results(limit=1)[0].rayci_step_id + if not step_id: continue - recent_commit = recent_results[0].commit - for result in recent_results: - # consider all results with the same recent commit; this is to make sure - # we will include different job flavors of the same test - if result.commit != recent_commit: - continue - step_id = result.rayci_step_id - if not step_id: - continue - step_id_to_tests[step_id] = step_id_to_tests.get(step_id, []) + [test] + step_id_to_tests[step_id] = step_id_to_tests.get(step_id, []) + [test] return 
step_id_to_tests diff --git a/release/ray_release/tests/test_test.py b/release/ray_release/tests/test_test.py index 229bb8b4f108..b9a756b5b3ed 100644 --- a/release/ray_release/tests/test_test.py +++ b/release/ray_release/tests/test_test.py @@ -61,11 +61,11 @@ def _stub_test(val: dict) -> Test: def _stub_test_result( - status: ResultStatus = ResultStatus.SUCCESS, rayci_step_id="123", commit="456" + status: ResultStatus = ResultStatus.SUCCESS, rayci_step_id="123" ) -> TestResult: return TestResult( status=status.value, - commit=commit, + commit="1234567890", branch="master", url="url", timestamp=0, @@ -340,7 +340,7 @@ def gen_high_impact_tests(mock_gen_from_s3) -> None: "name": "core_test", Test.KEY_IS_HIGH_IMPACT: "false", "test_results": [ - _stub_test_result(rayci_step_id="corebuild", commit="123"), + _stub_test_result(rayci_step_id="corebuild"), ], } ) @@ -349,9 +349,7 @@ def gen_high_impact_tests(mock_gen_from_s3) -> None: "name": "data_test_01", Test.KEY_IS_HIGH_IMPACT: "true", "test_results": [ - _stub_test_result(rayci_step_id="databuild", commit="123"), - _stub_test_result(rayci_step_id="data15build", commit="123"), - _stub_test_result(rayci_step_id="data12build", commit="456"), + _stub_test_result(rayci_step_id="databuild"), ], } ) @@ -360,7 +358,7 @@ def gen_high_impact_tests(mock_gen_from_s3) -> None: "name": "data_test_02", Test.KEY_IS_HIGH_IMPACT: "true", "test_results": [ - _stub_test_result(rayci_step_id="databuild", commit="789"), + _stub_test_result(rayci_step_id="databuild"), ], } ) @@ -368,8 +366,7 @@ def gen_high_impact_tests(mock_gen_from_s3) -> None: mock_gen_from_s3.return_value = [core_test, data_test_01, data_test_02] assert Test.gen_high_impact_tests("linux") == { - "databuild": [data_test_01, data_test_02], - "data15build": [data_test_01], + "databuild": [data_test_01, data_test_02] } From 4ebbcbf8d884d9b8b839ff96f119061b41c91989 Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Thu, 23 May 2024 18:33:19 +0200 Subject: [PATCH 04/65] [RLlib] 
Fix Single/MultiAgentEnvRunner missing env-to-module connector call in `_sample_episodes()`. (#45517) --- rllib/algorithms/callbacks.py | 22 ++++++++++--- rllib/env/multi_agent_env_runner.py | 48 ++++++++++++++++++++-------- rllib/env/single_agent_env_runner.py | 27 ++++++++++++++-- 3 files changed, 75 insertions(+), 22 deletions(-) diff --git a/rllib/algorithms/callbacks.py b/rllib/algorithms/callbacks.py index 6e8b61c1c8eb..e0d595be3c17 100644 --- a/rllib/algorithms/callbacks.py +++ b/rllib/algorithms/callbacks.py @@ -384,12 +384,24 @@ def on_episode_end( The exact time of the call of this callback is after `env.step([action])` and also after the results of this step (observation, reward, terminated, truncated, infos) have been logged to the given `episode` object, where either terminated - or truncated were True. + or truncated were True: - Note that on the new API stack, this callback is always preceeded by an - `on_episode_step` call, which comes before the call to this method, but is - provided with the non-finalized episode object (meaning the data has NOT - been converted to numpy arrays yet). + - The env is stepped: `final_obs, rewards, ... = env.step([action])` + + - The step results are logged `episode.add_env_step(final_obs, rewards)` + + - Callback `on_episode_step` is fired. + + - Another env-to-module connector call is made (even though we won't need any + RLModule forward pass anymore). We make this additional call to ensure that in + case users use the connector pipeline to process observations (and write them + back into the episode), the episode object has all observations - even the + terminal one - properly processed. + + - ---> This callback `on_episode_end()` is fired. <--- + + - The episode is finalized (i.e. lists of obs/rewards/actions/etc.. are + converted into numpy arrays). 
Args: episode: The terminated/truncated SingleAgent- or MultiAgentEpisode object diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index e60300c11a4b..3e76e8b0025b 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -320,6 +320,10 @@ def _sample_timesteps( extra_model_outputs=extra_model_outputs, ) + # Make the `on_episode_step` callback (before finalizing the episode + # object). + self._make_on_episode_callback("on_episode_step") + # Episode is done for all agents. Wrap up the old one and create a new # one (and reset it) to continue. if self._episode.is_done: @@ -329,17 +333,20 @@ def _sample_timesteps( # a call and in case the structure of the observations change # sufficiently, the following `finalize()` call on the episode will # fail. - self._env_to_module( - episodes=[self._episode], - explore=explore, - rl_module=self.module, - shared_data=self._shared_data, - ) + if self.module is not None: + self._env_to_module( + episodes=[self._episode], + explore=explore, + rl_module=self.module, + shared_data=self._shared_data, + ) - # Make the `on_episode_step` and `on_episode_end` callbacks (before - # finalizing the episode object). - self._make_on_episode_callback("on_episode_step") + # Make the `on_episode_end` callback (before finalizing the episode, + # but after(!) the last env-to-module connector call has been made. + # -> All obs (even the terminal one) should have been processed now (by + # the connector, if applicable). self._make_on_episode_callback("on_episode_end") + # Finalize (numpy'ize) the episode. self._episode.finalize(drop_zero_len_single_agent_episodes=True) done_episodes_to_return.append(self._episode) @@ -356,10 +363,6 @@ def _sample_timesteps( # Make the `on_episode_start` callback. self._make_on_episode_callback("on_episode_start") - else: - # Make the `on_episode_step` callback. 
- self._make_on_episode_callback("on_episode_step") - # Already perform env-to-module connector call for next call to # `_sample_timesteps()`. See comment in c'tor for `self._cached_to_module`. if self.module is not None: @@ -531,7 +534,24 @@ def _sample_episodes( # Increase episode count. eps += 1 - # Make `on_episode_end` callback before finalizing the episode. + # We have to perform an extra env-to-module pass here, just in case + # the user's connector pipeline performs (permanent) transforms + # on each observation (including this final one here). Without such + # a call and in case the structure of the observations change + # sufficiently, the following `finalize()` call on the episode will + # fail. + if self.module is not None: + self._env_to_module( + episodes=[_episode], + explore=explore, + rl_module=self.module, + shared_data=_shared_data, + ) + + # Make the `on_episode_end` callback (before finalizing the episode, + # but after(!) the last env-to-module connector call has been made. + # -> All obs (even the terminal one) should have been processed now (by + # the connector, if applicable). self._make_on_episode_callback("on_episode_end") # Finish the episode. diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index 1482da133bd0..065e70781ae5 100644 --- a/rllib/env/single_agent_env_runner.py +++ b/rllib/env/single_agent_env_runner.py @@ -334,6 +334,10 @@ def _sample_timesteps( truncated=truncateds[env_index], extra_model_outputs=extra_model_output, ) + # Make the `on_episode_step` and `on_episode_end` callbacks (before + # finalizing the episode object). + self._make_on_episode_callback("on_episode_step", env_index) + # We have to perform an extra env-to-module pass here, just in case # the user's connector pipeline performs (permanent) transforms # on each observation (including this final one here). 
Without such @@ -347,9 +351,7 @@ def _sample_timesteps( rl_module=self.module, shared_data=self._shared_data, ) - # Make the `on_episode_step` and `on_episode_end` callbacks (before - # finalizing the episode object). - self._make_on_episode_callback("on_episode_step", env_index) + self._make_on_episode_callback("on_episode_end", env_index) # Then finalize (numpy'ize) the episode. @@ -524,6 +526,25 @@ def _sample_episodes( self._make_on_episode_callback( "on_episode_step", env_index, episodes ) + + # We have to perform an extra env-to-module pass here, just in case + # the user's connector pipeline performs (permanent) transforms + # on each observation (including this final one here). Without such + # a call and in case the structure of the observations change + # sufficiently, the following `finalize()` call on the episode will + # fail. + if self.module is not None: + self._env_to_module( + episodes=[episodes[env_index]], + explore=explore, + rl_module=self.module, + shared_data=_shared_data, + ) + + # Make the `on_episode_end` callback (before finalizing the episode, + # but after(!) the last env-to-module connector call has been made. + # -> All obs (even the terminal one) should have been processed now + # (by the connector, if applicable). self._make_on_episode_callback( "on_episode_end", env_index, episodes ) From d8f5aeb16ff502c2eccb29def5802eccc00313d3 Mon Sep 17 00:00:00 2001 From: Gene Der Su Date: Thu, 23 May 2024 10:44:16 -0700 Subject: [PATCH 05/65] [Serve] keep serve alive with bad deployment on `serve run --reload cli` (#45483) ## Why are these changes needed? Put the file watching code inside a try-except block to prevent bad deployment shutdown serve. Also added a test to ensure this behavior. ## Related issue number Closes https://github.com/ray-project/ray/issues/45204 ## Checks - [ ] I've signed off every commit(by using the -s flag, i.e., `git commit -s`) in this PR. - [ ] I've run `scripts/format.sh` to lint the changes in this PR. 
- [ ] I've included any doc changes needed for https://docs.ray.io/en/master/. - [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file. - [ ] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [ ] Unit tests - [ ] Release tests - [ ] This PR is not tested :( --------- Signed-off-by: Gene Su --- python/ray/serve/scripts.py | 28 +++++++++++++++++----------- python/ray/serve/tests/test_cli_2.py | 25 +++++++++++++++++++++---- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/python/ray/serve/scripts.py b/python/ray/serve/scripts.py index 4132c9d12fcd..dbad99756546 100644 --- a/python/ray/serve/scripts.py +++ b/python/ray/serve/scripts.py @@ -563,17 +563,23 @@ def run( yield_on_timeout=True, ): if changes: - cli_logger.info( - f"Detected file change in path {watch_dir}. Redeploying app." - ) - # The module needs to be reloaded with `importlib` in order to pick - # up any changes. - app = _private_api.call_app_builder_with_args_if_necessary( - import_attr(import_path, reload_module=True), args_dict - ) - serve.run( - target=app, blocking=True, name=name, route_prefix=route_prefix - ) + try: + # The module needs to be reloaded with `importlib` in order to + # pick up any changes. + app = _private_api.call_app_builder_with_args_if_necessary( + import_attr(import_path, reload_module=True), args_dict + ) + serve.run( + target=app, + blocking=False, + name=name, + route_prefix=route_prefix, + ) + except Exception: + traceback.print_exc() + cli_logger.error( + "Deploying the latest version of the application failed." 
+ ) except KeyboardInterrupt: cli_logger.info("Got KeyboardInterrupt, shutting down...") diff --git a/python/ray/serve/tests/test_cli_2.py b/python/ray/serve/tests/test_cli_2.py index c5f842400b31..3d45ea2ea31a 100644 --- a/python/ray/serve/tests/test_cli_2.py +++ b/python/ray/serve/tests/test_cli_2.py @@ -49,13 +49,17 @@ def ping_endpoint(endpoint: str, params: str = ""): return CONNECTION_ERROR_MSG -def check_app_running(app_name: str): +def check_app_status(app_name: str, expected_status: str): status_response = subprocess.check_output(["serve", "status"]) status = yaml.safe_load(status_response)["applications"] - assert status[app_name]["status"] == "RUNNING" + assert status[app_name]["status"] == expected_status return True +def check_app_running(app_name: str): + return check_app_status(app_name, "RUNNING") + + @pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") def test_status_multi_app(ray_start_stop): """Deploys a multi-app config file and checks their status.""" @@ -813,6 +817,7 @@ def test_run_reload_basic(ray_start_stop, tmp_path): @serve.deployment class MessageDeployment: def __init__(self, msg): + {invalid_suffix} self.msg = msg def __call__(self): @@ -822,9 +827,9 @@ def __call__(self): msg_app = MessageDeployment.bind("Hello {message}!") """ - def write_file(message: str): + def write_file(message: str, invalid_suffix: str = ""): with open(os.path.join(tmp_path, "reload_serve.py"), "w") as f: - code = code_template.format(message=message) + code = code_template.format(invalid_suffix=invalid_suffix, message=message) print(f"Writing updated code:\n{code}") f.write(code) f.flush() @@ -851,6 +856,18 @@ def write_file(message: str): write_file("Updated") wait_for_condition(lambda: ping_endpoint("") == "Hello Updated!", timeout=10) + # Ensure a bad change doesn't shut down serve and serve reports deploy failed. 
+ write_file(message="update1", invalid_suffix="foobar") + wait_for_condition( + condition_predictor=check_app_status, + app_name="default", + expected_status="DEPLOY_FAILED", + ) + + # Ensure the following reload happens as expected. + write_file("Updated2") + wait_for_condition(lambda: ping_endpoint("") == "Hello Updated2!", timeout=10) + p.send_signal(signal.SIGINT) p.wait() assert ping_endpoint("") == CONNECTION_ERROR_MSG From 4fb670b84dc8b52a50a268bf05d3593189568b38 Mon Sep 17 00:00:00 2001 From: Tim Paine <3105306+timkpaine@users.noreply.github.com> Date: Thu, 23 May 2024 23:29:38 +0200 Subject: [PATCH 06/65] Unpin setproctitle, fixes #37727 (#40289) Unpin setproctitle, fixes #37727 Signed-off-by: Tim Paine <3105306+timkpaine@users.noreply.github.com> --- ci/env/install-dependencies.sh | 2 +- python/setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/env/install-dependencies.sh b/ci/env/install-dependencies.sh index 67e7d05bb826..784800bcc2c7 100755 --- a/ci/env/install-dependencies.sh +++ b/ci/env/install-dependencies.sh @@ -514,7 +514,7 @@ install_thirdparty_packages() { fi mkdir -p "${WORKSPACE_DIR}/python/ray/thirdparty_files" RAY_THIRDPARTY_FILES="$(realpath "${WORKSPACE_DIR}/python/ray/thirdparty_files")" - CC=gcc python -m pip install psutil==5.9.6 setproctitle==1.2.2 colorama==0.4.6 --target="${RAY_THIRDPARTY_FILES}" + CC=gcc python -m pip install psutil==5.9.6 "setproctitle>=1.2.2,<1.4" colorama==0.4.6 --target="${RAY_THIRDPARTY_FILES}" } install_dependencies() { diff --git a/python/setup.py b/python/setup.py index 8c1d573f5109..c402875e0f44 100644 --- a/python/setup.py +++ b/python/setup.py @@ -516,7 +516,7 @@ def build(build_python, build_java, build_cpp): # that certain flags will not be passed along such as --user or sudo. # TODO(rkn): Fix this. 
if not os.getenv("SKIP_THIRDPARTY_INSTALL"): - pip_packages = ["psutil", "setproctitle==1.2.2", "colorama"] + pip_packages = ["psutil", "setproctitle>=1.2.2,<1.4", "colorama"] subprocess.check_call( [ sys.executable, From 48e571157716dcae3584d73b95be6b2208aaf329 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 23 May 2024 19:14:18 -0700 Subject: [PATCH 07/65] [Data] Clarify that `num_rows_per_file` isn't strict (#45529) The parameter name num_rows_per_file suggests that the resulting files should have exactly the specified number of rows, but that isn't the case. This PR clarifies that the value is more of a hint. Signed-off-by: Balaji Veeramani --- python/ray/data/dataset.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py index 960cba0a61e9..76fe82f01022 100644 --- a/python/ray/data/dataset.py +++ b/python/ray/data/dataset.py @@ -2795,6 +2795,8 @@ def write_parquet( arguments for each dataset block. num_rows_per_file: The target number of rows to write to each file. If ``None``, Ray Data writes a system-chosen number of rows to each file. + The specified value is a hint, not a strict limit. Ray Data might write + more or fewer rows to each file. ray_remote_args: Kwargs passed to :meth:`~ray.remote` in the write tasks. concurrency: The maximum number of Ray tasks to run concurrently. Set this to control number of tasks to run concurrently. This doesn't change the @@ -2903,6 +2905,8 @@ def write_json( arguments for each dataset block. num_rows_per_file: The target number of rows to write to each file. If ``None``, Ray Data writes a system-chosen number of rows to each file. + The specified value is a hint, not a strict limit. Ray Data might write + more or fewer rows to each file. ray_remote_args: kwargs passed to :meth:`~ray.remote` in the write tasks. concurrency: The maximum number of Ray tasks to run concurrently. Set this to control number of tasks to run concurrently. 
This doesn't change the @@ -3083,6 +3087,8 @@ def write_csv( write arguments for each dataset block. num_rows_per_file: The target number of rows to write to each file. If ``None``, Ray Data writes a system-chosen number of rows to each file. + The specified value is a hint, not a strict limit. Ray Data might write + more or fewer rows to each file. ray_remote_args: kwargs passed to :meth:`~ray.remote` in the write tasks. concurrency: The maximum number of Ray tasks to run concurrently. Set this to control number of tasks to run concurrently. This doesn't change the @@ -3183,6 +3189,8 @@ def write_tfrecords( look like. num_rows_per_file: The target number of rows to write to each file. If ``None``, Ray Data writes a system-chosen number of rows to each file. + The specified value is a hint, not a strict limit. Ray Data might write + more or fewer rows to each file. ray_remote_args: kwargs passed to :meth:`~ray.remote` in the write tasks. concurrency: The maximum number of Ray tasks to run concurrently. Set this to control number of tasks to run concurrently. This doesn't change the @@ -3267,6 +3275,8 @@ def write_webdataset( implementation to write each dataset block to a custom output path. num_rows_per_file: The target number of rows to write to each file. If ``None``, Ray Data writes a system-chosen number of rows to each file. + The specified value is a hint, not a strict limit. Ray Data might write + more or fewer rows to each file. ray_remote_args: Kwargs passed to ``ray.remote`` in the write tasks. concurrency: The maximum number of Ray tasks to run concurrently. Set this to control number of tasks to run concurrently. This doesn't change the @@ -3354,6 +3364,8 @@ def write_numpy( look like. num_rows_per_file: The target number of rows to write to each file. If ``None``, Ray Data writes a system-chosen number of rows to each file. + The specified value is a hint, not a strict limit. Ray Data might write + more or fewer rows to each file. 
ray_remote_args: kwargs passed to :meth:`~ray.remote` in the write tasks. concurrency: The maximum number of Ray tasks to run concurrently. Set this to control number of tasks to run concurrently. This doesn't change the From 4851df7019112160badf2b6a1e384ab3a6909681 Mon Sep 17 00:00:00 2001 From: Hongchao Deng Date: Thu, 23 May 2024 21:38:40 -0700 Subject: [PATCH 08/65] [core] log dedup should not dedup number only lines (#45485) Signed-off-by: hongchaodeng --- python/ray/_private/ray_logging.py | 9 +++-- python/ray/tests/test_log_dedup.py | 54 ++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/python/ray/_private/ray_logging.py b/python/ray/_private/ray_logging.py index e9ee12195340..5699251e31ce 100644 --- a/python/ray/_private/ray_logging.py +++ b/python/ray/_private/ray_logging.py @@ -284,11 +284,16 @@ def deduplicate(self, batch: LogBatch) -> List[LogBatch]: continue dedup_key = _canonicalise_log_line(line) + if dedup_key == "": + # Don't dedup messages that are empty after canonicalization. + # Because that's all the information users want to see. + output[0]["lines"].append(line) + continue + if dedup_key in self.recent: sources = self.recent[dedup_key].sources sources.add(source) - # We deduplicate the warnings/errorm essages from - # raylet by default. + # We deduplicate the warnings/error messages from raylet by default. 
if len(sources) > 1 or batch["pid"] == "raylet": state = self.recent[dedup_key] self.recent[dedup_key] = DedupState( diff --git a/python/ray/tests/test_log_dedup.py b/python/ray/tests/test_log_dedup.py index c1353d7c9458..833af274bea0 100644 --- a/python/ray/tests/test_log_dedup.py +++ b/python/ray/tests/test_log_dedup.py @@ -19,6 +19,60 @@ def test_nodedup_logs_single_process(): assert out1 == [batch1] +def test_nodedup_logs_buffer_only_lines(): + now = 142300000.0 + + def gettime(): + return now + + dedup = LogDeduplicator(5, None, None, _timesource=gettime) + batch1 = { + "ip": "node1", + "pid": 100, + # numbers are canonicalised, so this would lead to empty dedup_key + "lines": ["1"], + } + + # Immediately prints always. + out1 = dedup.deduplicate(batch1) + assert out1 == [batch1] + + now += 1.0 + + # Should print new lines even if it is number only again + batch2 = { + "ip": "node2", + "pid": 200, + "lines": ["2"], + } + out2 = dedup.deduplicate(batch2) + assert out2 == [ + { + "ip": "node2", + "pid": 200, + "lines": ["2"], + } + ] + + now += 3.0 + + # Should print new lines even if it is same number + batch3 = { + "ip": "node3", + "pid": 300, + "lines": ["2"], + } + # Should buffer duplicates. + out3 = dedup.deduplicate(batch3) + assert out3 == [ + { + "ip": "node3", + "pid": 300, + "lines": ["2"], + } + ] + + def test_dedup_logs_multiple_processes(): now = 142300000.0 From 7fb0ce1edc3b53a82257d57a715b042d07045ad9 Mon Sep 17 00:00:00 2001 From: simonsays1980 Date: Fri, 24 May 2024 10:32:49 +0200 Subject: [PATCH 09/65] [RLlib] Add support for multi-agent off-policy algorithms in the new API stack. 
(#45182) --- .bazelrc | 1 + rllib/BUILD | 19 ++ rllib/algorithms/dqn/dqn.py | 5 +- rllib/algorithms/dqn/dqn_rainbow_learner.py | 6 +- .../dqn/torch/dqn_rainbow_torch_learner.py | 2 +- rllib/algorithms/sac/sac.py | 1 + rllib/algorithms/sac/sac_learner.py | 1 - .../algorithms/sac/torch/sac_torch_learner.py | 9 +- .../common/agent_to_module_mapping.py | 3 - .../common/batch_individual_items.py | 5 +- rllib/env/multi_agent_env_runner.py | 9 +- rllib/env/multi_agent_episode.py | 24 ++ .../sac/multi_agent_pendulum_sac.py | 79 +++++ rllib/utils/metrics/__init__.py | 1 + .../multi_agent_episode_replay_buffer.py | 272 ++++++++---------- .../prioritized_episode_replay_buffer.py | 1 + .../test_multi_agent_episode_replay_buffer.py | 64 ++--- rllib/utils/replay_buffers/utils.py | 5 +- 18 files changed, 299 insertions(+), 208 deletions(-) create mode 100644 rllib/tuned_examples/sac/multi_agent_pendulum_sac.py diff --git a/.bazelrc b/.bazelrc index 1637085f31f5..307e9e38dcfd 100644 --- a/.bazelrc +++ b/.bazelrc @@ -167,6 +167,7 @@ test:ci --flaky_test_attempts=3 test:ci --nocache_test_results test:ci --spawn_strategy=local test:ci --test_output=errors +test:ci --experimental_ui_max_stdouterr_bytes=-1 test:ci --test_verbose_timeout_warnings test:ci-debug -c dbg test:ci-debug --copt="-g" diff --git a/rllib/BUILD b/rllib/BUILD index 9333e7b1adeb..5af156d68c77 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -463,6 +463,25 @@ py_test( args = ["--dir=tuned_examples/sac"] ) +# TODO (simon): These tests are not learning, yet. 
+# py_test( +# name = "learning_tests_multi_agent_pendulum_sac", +# main = "tuned_examples/sac/multi_agent_pendulum_sac.py", +# tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_pendulum", "learning_tests_continuous"], +# size = "large", +# srcs = ["tuned_examples/sac/multi_agent_pendulum_sac.py"], +# args = ["--enable-new-api-stack", "--num-agents=2"] +# ) + +# py_test( +# name = "learning_tests_multi_agent_pendulum_sac_multi_gpu", +# main = "tuned_examples/sac/multi_agent_pendulum_sac.py", +# tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_pendulum", "learning_tests_continuous", "multi_gpu"], +# size = "large", +# srcs = ["tuned_examples/sac/multi_agent_pendulum_sac.py"], +# args = ["--enable-new-api-stack", "--num-agents=2", "--num-gpus=2"] +# ) + # -------------------------------------------------------------------- # Algorithms (Compilation, Losses, simple functionality tests) # rllib/algorithms/ diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index 78502085ff51..2368b733376f 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -16,7 +16,6 @@ from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.dqn.dqn_rainbow_learner import TD_ERROR_KEY from ray.rllib.algorithms.dqn.dqn_tf_policy import DQNTFPolicy from ray.rllib.algorithms.dqn.dqn_torch_policy import DQNTorchPolicy from ray.rllib.core.learner import Learner @@ -64,6 +63,7 @@ REPLAY_BUFFER_UPDATE_PRIOS_TIMER, SAMPLE_TIMER, SYNCH_WORKER_WEIGHTS_TIMER, + TD_ERROR_KEY, TIMERS, ) from ray.rllib.utils.deprecation import DEPRECATED_VALUE @@ -662,7 +662,7 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: num_items=self.config.train_batch_size, n_step=self.config.n_step, gamma=self.config.gamma, - beta=self.config.replay_buffer_config["beta"], + 
beta=self.config.replay_buffer_config.get("beta"), ) # Perform an update on the buffer-sampled train batch. @@ -700,6 +700,7 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: }, reduce="sum", ) + # TODO (sven): Uncomment this once agent steps are available in the # Learner stats. # self.metrics.log_dict(self.metrics.peek( diff --git a/rllib/algorithms/dqn/dqn_rainbow_learner.py b/rllib/algorithms/dqn/dqn_rainbow_learner.py index 1aba7f757008..abc73ada8413 100644 --- a/rllib/algorithms/dqn/dqn_rainbow_learner.py +++ b/rllib/algorithms/dqn/dqn_rainbow_learner.py @@ -13,7 +13,10 @@ override, OverrideToImplementCustomLogic_CallToSuperRecommended, ) -from ray.rllib.utils.metrics import LAST_TARGET_UPDATE_TS, NUM_TARGET_UPDATES +from ray.rllib.utils.metrics import ( + LAST_TARGET_UPDATE_TS, + NUM_TARGET_UPDATES, +) from ray.rllib.utils.typing import ModuleID if TYPE_CHECKING: @@ -32,7 +35,6 @@ QF_TARGET_NEXT_PROBS = "qf_target_next_probs" QF_PREDS = "qf_preds" QF_PROBS = "qf_probs" -TD_ERROR_KEY = "td_error" TD_ERROR_MEAN_KEY = "td_error_mean" diff --git a/rllib/algorithms/dqn/torch/dqn_rainbow_torch_learner.py b/rllib/algorithms/dqn/torch/dqn_rainbow_torch_learner.py index 0a887908b114..7ec354ba61f9 100644 --- a/rllib/algorithms/dqn/torch/dqn_rainbow_torch_learner.py +++ b/rllib/algorithms/dqn/torch/dqn_rainbow_torch_learner.py @@ -14,13 +14,13 @@ QF_TARGET_NEXT_PROBS, QF_PREDS, QF_PROBS, - TD_ERROR_KEY, TD_ERROR_MEAN_KEY, ) from ray.rllib.core.columns import Columns from ray.rllib.core.learner.torch.torch_learner import TorchLearner from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.metrics import TD_ERROR_KEY from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.typing import ModuleID, TensorType diff --git a/rllib/algorithms/sac/sac.py b/rllib/algorithms/sac/sac.py index e249e2dae7a3..8ab0c89cf3ad 100644 --- a/rllib/algorithms/sac/sac.py +++ 
b/rllib/algorithms/sac/sac.py @@ -352,6 +352,7 @@ def validate(self) -> None: ] not in [ "EpisodeReplayBuffer", "PrioritizedEpisodeReplayBuffer", + "MultiAgentEpisodeReplayBuffer", ]: raise ValueError( "When using the new `EnvRunner API` the replay buffer must be of type " diff --git a/rllib/algorithms/sac/sac_learner.py b/rllib/algorithms/sac/sac_learner.py index 94a2e907de96..cdbc44ee749b 100644 --- a/rllib/algorithms/sac/sac_learner.py +++ b/rllib/algorithms/sac/sac_learner.py @@ -20,7 +20,6 @@ QF_TWIN_LOSS_KEY = "qf_twin_loss" QF_TWIN_PREDS = "qf_twin_preds" TD_ERROR_MEAN_KEY = "td_error_mean" -TD_ERROR_KEY = "td_error" class SACLearner(DQNRainbowLearner): diff --git a/rllib/algorithms/sac/torch/sac_torch_learner.py b/rllib/algorithms/sac/torch/sac_torch_learner.py index 0565e950136b..229b8cc4549f 100644 --- a/rllib/algorithms/sac/torch/sac_torch_learner.py +++ b/rllib/algorithms/sac/torch/sac_torch_learner.py @@ -15,17 +15,15 @@ QF_TWIN_LOSS_KEY, QF_TWIN_PREDS, TD_ERROR_MEAN_KEY, - TD_ERROR_KEY, SACLearner, ) -from ray.rllib.core import DEFAULT_MODULE_ID from ray.rllib.core.columns import Columns from ray.rllib.core.learner.learner import ( POLICY_LOSS_KEY, ) from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.metrics import ALL_MODULES +from ray.rllib.utils.metrics import ALL_MODULES, TD_ERROR_KEY from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.typing import ModuleID, ParamDict, TensorType @@ -221,8 +219,6 @@ def compute_loss_for_module( # Note further, we use here the Huber loss instead of the mean squared error # as it improves training performance. critic_loss = torch.mean( - # TODO (simon): Introduce priority weights when episode buffer is ready. 
- # batch[PRIO_WEIGHTS] * batch["weights"] * torch.nn.HuberLoss(reduction="none", delta=1.0)( q_selected, q_selected_target @@ -303,6 +299,7 @@ def compute_loss_for_module( def compute_gradients( self, loss_per_module: Dict[str, TensorType], **kwargs ) -> ParamDict: + # Set all grads to `None`. for optim in self._optimizer_parameters: optim.zero_grad(set_to_none=True) @@ -317,7 +314,7 @@ def compute_gradients( for component in ( ["qf", "policy", "alpha"] + ["qf_twin"] if config.twin_q else [] ): - self.metrics.peek(DEFAULT_MODULE_ID, component + "_loss").backward( + self.metrics.peek(module_id, component + "_loss").backward( retain_graph=True ) grads.update( diff --git a/rllib/connectors/common/agent_to_module_mapping.py b/rllib/connectors/common/agent_to_module_mapping.py index c304fa60a174..b54a20bb050f 100644 --- a/rllib/connectors/common/agent_to_module_mapping.py +++ b/rllib/connectors/common/agent_to_module_mapping.py @@ -133,9 +133,6 @@ def __call__( shared_data: Optional[dict] = None, **kwargs, ) -> Any: - # This Connector should only be used in a multi-agent setting. - assert not episodes or isinstance(episodes[0], MultiAgentEpisode) - # Current agent to module mapping function. # agent_to_module_mapping_fn = shared_data.get("agent_to_module_mapping_fn") # Store in shared data, which module IDs map to which episode/agent, such diff --git a/rllib/connectors/common/batch_individual_items.py b/rllib/connectors/common/batch_individual_items.py index e4a4f2ac8d86..9b5460b4cb49 100644 --- a/rllib/connectors/common/batch_individual_items.py +++ b/rllib/connectors/common/batch_individual_items.py @@ -33,7 +33,10 @@ def __call__( # to a batch structure of: # [module_id] -> [col0] -> [list of items] if is_marl_module and column in rl_module: - assert is_multi_agent + # assert is_multi_agent + # TODO (simon, sven): Check, if we need for other cases this check. + # If MA Off-Policy and independent sampling we need to overcome + # this check. 
module_data = column_data for col, col_data in module_data.copy().items(): if isinstance(col_data, list) and col != Columns.INFOS: diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index 3e76e8b0025b..043083aaa11a 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -3,7 +3,6 @@ from collections import defaultdict from functools import partial -import numpy as np from typing import DefaultDict, Dict, List, Optional from ray.rllib.algorithms.algorithm_config import AlgorithmConfig @@ -623,9 +622,11 @@ def get_metrics(self) -> ResultDict: module_episode_returns, ) - # If no episodes at all, log NaN stats. - if len(self._done_episodes_for_metrics) == 0: - self._log_episode_metrics(np.nan, np.nan, np.nan) + # TODO (simon): This results in hundreds of warnings in the logs + # b/c reducing over NaNs is not supported. + # # If no episodes at all, log NaN stats. + # if len(self._done_episodes_for_metrics) == 0: + # self._log_episode_metrics(np.nan, np.nan, np.nan) # Log num episodes counter for this iteration. self.metrics.log_value( diff --git a/rllib/env/multi_agent_episode.py b/rllib/env/multi_agent_episode.py index 216aaf5f0f31..a59cec2d7a63 100644 --- a/rllib/env/multi_agent_episode.py +++ b/rllib/env/multi_agent_episode.py @@ -58,6 +58,30 @@ class MultiAgentEpisode: up to here, b/c there is nothing to learn from these "premature" rewards. 
""" + __slots__ = ( + "id_", + "agent_to_module_mapping_fn", + "_agent_to_module_mapping", + "observation_space", + "action_space", + "env_t_started", + "env_t", + "agent_t_started", + "env_t_to_agent_t", + "_hanging_actions_end", + "_hanging_extra_model_outputs_end", + "_hanging_rewards_end", + "_hanging_actions_begin", + "_hanging_extra_model_outputs_begin", + "_hanging_rewards_begin", + "is_terminated", + "is_truncated", + "agent_episodes", + "_temporary_timestep_data", + "_start_time", + "_last_step_time", + ) + SKIP_ENV_TS_TAG = "S" def __init__( diff --git a/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py b/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py new file mode 100644 index 000000000000..a70ca0b9d62b --- /dev/null +++ b/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py @@ -0,0 +1,79 @@ +from ray.rllib.algorithms.sac import SACConfig +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.tune.registry import register_env + +from ray.rllib.utils.test_utils import add_rllib_example_script_args + +parser = add_rllib_example_script_args() +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values to set up `config` below. 
+args = parser.parse_args() + +register_env( + "multi_agent_pendulum", + lambda _: MultiAgentPendulum({"num_agents": args.num_agents or 2}), +) + +config = ( + SACConfig() + .environment(env="multi_agent_pendulum") + .rl_module( + model_config_dict={ + "fcnet_hiddens": [256, 256], + "fcnet_activation": "relu", + "post_fcnet_hiddens": [], + "post_fcnet_activation": None, + "post_fcnet_weights_initializer": "orthogonal_", + "post_fcnet_weights_initializer_config": {"gain": 0.01}, + } + ) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .env_runners( + rollout_fragment_length=1, + num_env_runners=2, + num_envs_per_env_runner=1, + ) + .training( + initial_alpha=1.001, + lr=3e-4, + target_entropy="auto", + n_step=1, + tau=0.005, + train_batch_size_per_learner=256, + target_network_update_freq=1, + replay_buffer_config={ + "type": "MultiAgentEpisodeReplayBuffer", + "capacity": 100000, + }, + num_steps_sampled_before_learning_starts=256, + ) + .reporting( + metrics_num_episodes_for_smoothing=5, + min_sample_timesteps_per_iteration=1000, + ) +) + +if args.num_agents: + config.multi_agent( + policy_mapping_fn=lambda aid, *arg, **kw: f"p{aid}", + policies={f"p{i}" for i in range(args.num_agents)}, + ) + +stop = { + NUM_ENV_STEPS_SAMPLED_LIFETIME: 500000, + # `episode_return_mean` is the sum of all agents/policies' returns. + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -400.0 * (args.num_agents or 2), +} + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py index 164ec20cf405..39e087da9434 100644 --- a/rllib/utils/metrics/__init__.py +++ b/rllib/utils/metrics/__init__.py @@ -91,3 +91,4 @@ # Learner. 
LEARNER_STATS_KEY = "learner_stats" ALL_MODULES = "__all_modules__" +TD_ERROR_KEY = "td_error" diff --git a/rllib/utils/replay_buffers/multi_agent_episode_replay_buffer.py b/rllib/utils/replay_buffers/multi_agent_episode_replay_buffer.py index c32415ba01a9..9b1af8e86cff 100644 --- a/rllib/utils/replay_buffers/multi_agent_episode_replay_buffer.py +++ b/rllib/utils/replay_buffers/multi_agent_episode_replay_buffer.py @@ -3,10 +3,11 @@ from gymnasium.core import ActType, ObsType import numpy as np import scipy -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union from ray.rllib.core.columns import Columns from ray.rllib.env.multi_agent_episode import MultiAgentEpisode +from ray.rllib.env.single_agent_episode import SingleAgentEpisode from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer from ray.rllib.utils import force_list from ray.rllib.utils.annotations import override, DeveloperAPI @@ -130,40 +131,42 @@ def add( """ episodes: List["MultiAgentEpisode"] = force_list(episodes) - new_episode_ids: List[str] = [] - for eps in episodes: - new_episode_ids.append(eps.id_) - self._num_timesteps += eps.env_steps() - self._num_timesteps_added += eps.env_steps() + new_episode_ids: List[str] = {eps.id_ for eps in episodes} + total_env_timesteps = sum([eps.env_steps() for eps in episodes]) + self._num_timesteps += total_env_timesteps + self._num_timesteps_added += total_env_timesteps # Evict old episodes. - eps_evicted: List["MultiAgentEpisode"] = [] - eps_evicted_ids: List[Union[str, int]] = [] - eps_evicted_idxs: List[int] = [] + eps_evicted_ids: Set[Union[str, int]] = set() + eps_evicted_idxs: Set[int] = set() while ( self._num_timesteps > self.capacity and self._num_remaining_episodes(new_episode_ids, eps_evicted_ids) != 1 ): # Evict episode. 
evicted_episode = self.episodes.popleft() - eps_evicted.append(evicted_episode) - eps_evicted_ids.append(evicted_episode.id_) - eps_evicted_idxs.append(self.episode_id_to_index.pop(evicted_episode.id_)) + eps_evicted_ids.add(evicted_episode.id_) + eps_evicted_idxs.add(self.episode_id_to_index.pop(evicted_episode.id_)) # If this episode has a new chunk in the new episodes added, # we subtract it again. # TODO (sven, simon): Should we just treat such an episode chunk # as a new episode? if evicted_episode.id_ in new_episode_ids: - new_eps_to_evict = episodes[new_episode_ids.index(evicted_episode.id_)] + idx = next( + i + for i, eps in enumerate(episodes) + if eps.id_ == evicted_episode.id_ + ) + new_eps_to_evict = episodes.pop(idx) self._num_timesteps -= new_eps_to_evict.env_steps() self._num_timesteps_added -= new_eps_to_evict.env_steps() - episodes.remove(new_eps_to_evict) # Remove the timesteps of the evicted episode from the counter. self._num_timesteps -= evicted_episode.env_steps() self._num_agent_timesteps -= evicted_episode.agent_steps() self._num_episodes_evicted += 1 # Remove the module timesteps of the evicted episode from the counters. self._evict_module_episodes(evicted_episode) + del evicted_episode # Add agent and module steps. for eps in episodes: @@ -174,41 +177,38 @@ def add( # Remove corresponding indices, if episodes were evicted. if eps_evicted_idxs: - new_indices = [] - # Each index 2-tuple is of the form (ma_episode_idx, timestep) and + # If the episode is not evicted, we keep the index. + # Note, each index 2-tuple is of the form (ma_episode_idx, timestep) and # refers to a certain environment timestep in a certain multi-agent # episode. - for idx_tuple in self._indices: - # If episode index is not from an evicted episode, keep it. - if idx_tuple[0] not in eps_evicted_idxs: - new_indices.append(idx_tuple) - # Assign the new list of indices.
- self._indices = new_indices + self._indices = [ + idx_tuple + for idx_tuple in self._indices + if idx_tuple[0] not in eps_evicted_idxs + ] # Also remove corresponding module indices. for module_id, module_indices in self._module_to_indices.items(): - new_module_indices = [] # Each index 3-tuple is of the form # (ma_episode_idx, agent_id, timestep) and refers to a certain # agent timestep in a certain multi-agent episode. - for idx_triplet in module_indices: - if idx_triplet[0] not in eps_evicted_idxs: - new_module_indices.append(idx_triplet) - self._module_to_indices[module_id] = new_module_indices + self._module_to_indices[module_id] = [ + idx_triplet + for idx_triplet in module_indices + if idx_triplet[0] not in eps_evicted_idxs + ] for eps in episodes: eps = copy.deepcopy(eps) # If the episode is part of an already existing episode, concatenate. if eps.id_ in self.episode_id_to_index: eps_idx = self.episode_id_to_index[eps.id_] - existing_eps = self.episodes[eps_idx] + existing_eps = self.episodes[eps_idx - self._num_episodes_evicted] existing_len = len(existing_eps) self._indices.extend( [ ( eps_idx, - # Note, we add 1 b/c the first timestep is - # never sampled. - existing_len + i + 1, + existing_len + i, ) for i in range(len(eps)) ] @@ -223,7 +223,7 @@ def add( self.episodes.append(eps) eps_idx = len(self.episodes) - 1 + self._num_episodes_evicted self.episode_id_to_index[eps.id_] = eps_idx - self._indices.extend([(eps_idx, i + 1) for i in range(len(eps))]) + self._indices.extend([(eps_idx, i) for i in range(len(eps))]) # Add new module indices. self._add_new_module_indices(eps, eps_idx, False) @@ -240,6 +240,7 @@ def sample( include_extra_model_outputs: bool = False, replay_mode: str = "independent", modules_to_sample: Optional[List[ModuleID]] = None, + **kwargs, ) -> SampleBatchType: """Samples a batch of multi-agent transitions. 
@@ -458,46 +459,25 @@ def _sample_independent( gamma: float, include_infos: bool, include_extra_model_outputs: bool, - modules_to_sample: Optional[List[ModuleID]], + modules_to_sample: Optional[Set[ModuleID]], ) -> SampleBatchType: """Samples a batch of independent multi-agent transitions.""" + + actual_n_step = n_step or 1 # Sample the n-step if necessary. - if isinstance(n_step, tuple): - # Use random n-step sampling. - random_n_step = True - else: - actual_n_step = n_step or 1 - random_n_step = False + random_n_step = isinstance(n_step, tuple) - ret = {} + sampled_episodes = [] # TODO (simon): Ensure that the module has data and if not, skip it. # TODO (sven): Should we then error out or skip? I think the Learner # should handle this case when a module has no train data. - for module_id in modules_to_sample or self._module_to_indices.keys(): - # Rows to return. - observations: List[List[ObsType]] = [[] for _ in range(batch_size_B)] - next_observations: List[List[ObsType]] = [[] for _ in range(batch_size_B)] - actions: List[List[ActType]] = [[] for _ in range(batch_size_B)] - rewards: List[List[float]] = [[] for _ in range(batch_size_B)] - is_terminated: List[bool] = [False for _ in range(batch_size_B)] - is_truncated: List[bool] = [False for _ in range(batch_size_B)] - weights: List[float] = [[1.0] for _ in range(batch_size_B)] - n_steps: List[List[int]] = [[] for _ in range(batch_size_B)] - # If `info` should be included, construct also a container for them. - if include_infos: - infos: List[List[Dict[str, Any]]] = [[] for _ in range(batch_size_B)] - # If `extra_model_outputs` should be included, construct a container for - # them. 
- if include_extra_model_outputs: - extra_model_outputs: List[List[Dict[str, Any]]] = [ - [] for _ in range(batch_size_B) - ] + modules_to_sample = modules_to_sample or set(self._module_to_indices.keys()) + for module_id in modules_to_sample: + module_indices = self._module_to_indices[module_id] B = 0 while B < batch_size_B: # Now sample from the single-agent timesteps. - index_tuple = self._module_to_indices[module_id][ - self.rng.integers(len(self._module_to_indices[module_id])) - ] + index_tuple = module_indices[self.rng.integers(len(module_indices))] # This will be an agent timestep (not env timestep). # TODO (simon, sven): Maybe deprecate sa_episode_idx (_) in the index @@ -507,109 +487,95 @@ def _sample_independent( index_tuple[1], index_tuple[2], ) - # If we cannnot make the n-step, we resample. - if sa_episode_ts - n_step < 0: - continue - # If we use random n-step sampling, draw the n-step for this item. - if random_n_step: - actual_n_step = int(self.rng.integers(n_step[0], n_step[1])) - # If we are at the end of an episode, continue. - # Note, priority sampling got us `o_(t+n)` and we need for the loss - # calculation in addition `o_t`. - # TODO (simon): Maybe introduce a variable `num_retries` until the - # while loop should break when not enough samples have been collected - # to make n-step possible. - if sa_episode_ts - actual_n_step < 0: - continue - else: - n_steps[B] = actual_n_step + # Get the multi-agent episode. ma_episode = self.episodes[ma_episode_idx] # Retrieve the single-agent episode for filtering. sa_episode = ma_episode.agent_episodes[agent_id] - # Ensure that each row contains a tuple of the form: - # (o_t, a_t, sum(r_(t:t+n_step)), o_(t+n_step)) - # TODO (simon): Implement version for sequence sampling when using RNNs. 
- sa_eps_observation = sa_episode.get_observations( - slice(sa_episode_ts - actual_n_step, sa_episode_ts + 1) - ) - # Note, the reward that is collected by transitioning from `o_t` to - # `o_(t+1)` is stored in the next transition in `SingleAgentEpisode`. - sa_eps_rewards = sa_episode.get_rewards( - slice(sa_episode_ts - actual_n_step, sa_episode_ts) - ) - observations[B] = sa_eps_observation[0] - next_observations[B] = sa_eps_observation[-1] + + # If we use random n-step sampling, draw the n-step for this item. + if random_n_step: + actual_n_step = int(self.rng.integers(n_step[0], n_step[1])) + # If we cannot make the n-step, we resample. + if sa_episode_ts + actual_n_step > len(sa_episode): + continue # Note, this will be the reward after executing action - # `a_(episode_ts-n_step+1)`. For `n_step>1` this will be the sum of + # `a_(episode_ts)`. For `n_step>1` this will be the sum of # all rewards that were collected over the last n steps. - rewards[B] = scipy.signal.lfilter( - [1], [1, -gamma], sa_eps_rewards[::-1], axis=0 + sa_raw_rewards = sa_episode.get_rewards( + slice(sa_episode_ts, sa_episode_ts + actual_n_step) + ) + sa_rewards = scipy.signal.lfilter( + [1], [1, -gamma], sa_raw_rewards[::-1], axis=0 )[-1] - # Note, `SingleAgentEpisode` stores the action that followed - # `o_t` with `o_(t+1)`, therefore, we need the next one. - # TODO (simon): This gets the wrong action as long as the getters are - # not fixed. - actions[B] = sa_episode.get_actions(sa_episode_ts - actual_n_step) - if include_infos: - # If infos are included we include the ones from the last timestep - # as usually the info contains additional values about the last - # state. - infos[B] = sa_episode.get_infos(sa_episode_ts) - if include_extra_model_outputs: - # If `extra_model_outputs` are included we include the ones from the - # first timestep as usually the `extra_model_outputs` contain - # additional values from the forward pass that produced the action - # at the first timestep.
- # Note, we extract them into single row dictionaries similar to the - # infos, in a connector we can then extract these into single batch - # rows. - extra_model_outputs[B] = { - k: sa_episode.get_extra_model_outputs( - k, sa_episode_ts - actual_n_step - ) - for k in sa_episode.extra_model_outputs.keys() - } - # If the sampled time step is the episode's last time step check, if - # the episode is terminated or truncated. - if sa_episode_ts == sa_episode.t: - is_terminated[B] = sa_episode.is_terminated - is_truncated[B] = sa_episode.is_truncated + + sampled_sa_episode = SingleAgentEpisode( + id_=sa_episode.id_, + # Provide the IDs for the learner connector. + agent_id=sa_episode.agent_id, + module_id=sa_episode.module_id, + multi_agent_episode_id=ma_episode.id_, + # Ensure that each episode contains a tuple of the form: + # (o_t, a_t, sum(r_(t:t+n_step)), o_(t+n_step)) + # Two observations (t and t+n). + observations=[ + sa_episode.get_observations(sa_episode_ts), + sa_episode.get_observations(sa_episode_ts + actual_n_step), + ], + observation_space=sa_episode.observation_space, + infos=( + [ + sa_episode.get_infos(sa_episode_ts), + sa_episode.get_infos(sa_episode_ts + actual_n_step), + ] + if include_infos + else None + ), + actions=[sa_episode.get_actions(sa_episode_ts)], + action_space=sa_episode.action_space, + rewards=[sa_rewards], + # If the sampled single-agent episode is the single-agent episode's + # last time step, check, if the single-agent episode is terminated + # or truncated. 
+ terminated=( + sa_episode_ts + actual_n_step >= len(sa_episode) + and sa_episode.is_terminated + ), + truncated=( + sa_episode_ts + actual_n_step >= len(sa_episode) + and sa_episode.is_truncated + ), + extra_model_outputs={ + "weights": [1.0], + "n_step": [actual_n_step], + **( + { + k: [ + sa_episode.get_extra_model_outputs(k, sa_episode_ts) + ] + for k in sa_episode.extra_model_outputs.keys() + } + if include_extra_model_outputs + else {} + ), + }, + # TODO (sven): Support lookback buffers. + len_lookback_buffer=0, + t_started=sa_episode_ts, + ) + # Append single-agent episode to the list of sampled episodes. + sampled_episodes.append(sampled_sa_episode) # Increase counter. B += 1 # Increase the per module timesteps counter. - self.sampled_timesteps_per_module[module_id] += batch_size_B - ret[module_id] = { - # Note, observation and action spaces could be complex. `batch` - # takes care of these. - Columns.OBS: batch(observations), - Columns.ACTIONS: batch(actions), - Columns.REWARDS: np.array(rewards), - Columns.NEXT_OBS: batch(next_observations), - Columns.TERMINATEDS: np.array(is_terminated), - Columns.TRUNCATEDS: np.array(is_truncated), - "weights": np.array(weights), - "n_step": np.array(n_steps), - } - # Include infos if necessary. - if include_infos: - ret[module_id].update( - { - Columns.INFOS: infos, - } - ) - # Include extra model outputs, if necessary. - if include_extra_model_outputs: - ret[module_id].update( - # These could be complex, too. - batch(extra_model_outputs) - ) + self.sampled_timesteps_per_module[module_id] += B + # Increase the counter for environment timesteps. self.sampled_timesteps += batch_size_B # Return multi-agent dictionary. 
- return ret + return sampled_episodes def _sample_synchonized( self, @@ -899,7 +865,9 @@ def _add_new_module_indices( sa_episode_in_buffer = False if sa_episode_in_buffer: existing_eps_len = len( - self.episodes[episode_idx].agent_episodes[agent_id] + self.episodes[ + episode_idx - self._num_episodes_evicted + ].agent_episodes[agent_id] ) else: existing_eps_len = 0 @@ -910,7 +878,7 @@ def _add_new_module_indices( # Keep the MAE index for sampling episode_idx, agent_id, - existing_eps_len + i + 1, + existing_eps_len + i, ) for i in range(len(module_eps)) ] diff --git a/rllib/utils/replay_buffers/prioritized_episode_replay_buffer.py b/rllib/utils/replay_buffers/prioritized_episode_replay_buffer.py index 38690232351e..b2a656d75f06 100644 --- a/rllib/utils/replay_buffers/prioritized_episode_replay_buffer.py +++ b/rllib/utils/replay_buffers/prioritized_episode_replay_buffer.py @@ -217,6 +217,7 @@ def add( # TODO (sven, simon): Should we just treat such an episode chunk # as a new episode? if eps_evicted_ids[-1] in new_episode_ids: + # TODO (simon): Apply the same logic as in the MA-case. 
len_to_subtract = len( episodes[new_episode_ids.index(eps_evicted_idxs[-1])] ) diff --git a/rllib/utils/replay_buffers/tests/test_multi_agent_episode_replay_buffer.py b/rllib/utils/replay_buffers/tests/test_multi_agent_episode_replay_buffer.py index 14a3860c5e6c..3844b5a485c6 100644 --- a/rllib/utils/replay_buffers/tests/test_multi_agent_episode_replay_buffer.py +++ b/rllib/utils/replay_buffers/tests/test_multi_agent_episode_replay_buffer.py @@ -5,6 +5,7 @@ from ray.rllib.utils.replay_buffers.multi_agent_episode_replay_buffer import ( MultiAgentEpisodeReplayBuffer, ) +from ray.rllib.utils.test_utils import check class TestMultiAgentEpisodeReplayBuffer(unittest.TestCase): @@ -150,59 +151,58 @@ def test_buffer_independent_sample_logic(self): for i in range(1000): sample = buffer.sample(batch_size_B=16, n_step=1) self.assertTrue(buffer.get_sampled_timesteps() == 16 * (i + 1)) - self.assertTrue("module_1" in sample) - self.assertTrue("module_2" in sample) - for module_id in sample: - self.assertTrue(buffer.get_sampled_timesteps(module_id) == 16 * (i + 1)) + module_ids = {eps.module_id for eps in sample} + self.assertTrue("module_1" in module_ids) + self.assertTrue("module_2" in module_ids) + for eps in sample: + # For both modules, we should have 16 x (i + 1) timesteps sampled. + # Note, this must be the same here as the number of timesteps sampled + # altogether, b/c we sample both modules. 
+ self.assertTrue( + buffer.get_sampled_timesteps("module_1") == 16 * (i + 1) + ) + self.assertTrue( + buffer.get_sampled_timesteps("module_2") == 16 * (i + 1) + ) ( obs, - actions, - rewards, + action, + reward, next_obs, is_terminated, is_truncated, - weights, - n_steps, + weight, + n_step, ) = ( - sample[module_id]["obs"], - sample[module_id]["actions"], - sample[module_id]["rewards"], - sample[module_id]["new_obs"], - sample[module_id]["terminateds"], - sample[module_id]["truncateds"], - sample[module_id]["weights"], - sample[module_id]["n_step"], + eps.get_observations(0), + eps.get_actions(-1), + eps.get_rewards(-1), + eps.get_observations(-1), + eps.is_terminated, + eps.is_truncated, + eps.get_extra_model_outputs("weights", -1), + eps.get_extra_model_outputs("n_step", -1), ) # Make sure terminated and truncated are never both True. - assert not np.any(np.logical_and(is_truncated, is_terminated)) - - # All fields have same shape. - assert ( - obs.shape[:2] - == rewards.shape - == actions.shape - == next_obs.shape - == is_truncated.shape - == is_terminated.shape - ) + assert not (is_truncated and is_terminated) # Note, floating point numbers cannot be compared directly. tolerance = 1e-8 # Assert that actions correspond to the observations. - self.assertTrue(np.all(actions - obs < tolerance)) + check(obs, action, atol=tolerance) # Assert that next observations are correctly one step after # observations. - self.assertTrue(np.all(next_obs - obs - 1 < tolerance)) + check(next_obs, obs + 1, atol=tolerance) # Assert that the reward comes from the next observation. - self.assertTrue(np.all(rewards * 10 - next_obs < tolerance)) + check(reward * 10, next_obs, atol=tolerance) # Furthermore, assert that the importance sampling weights are # one for `beta=0.0`. - self.assertTrue(np.all(weights - 1.0 < tolerance)) + check(weight, 1.0, atol=tolerance) # Assert that all n-steps are 1.0 as passed into `sample`. 
- self.assertTrue(np.all(n_steps - 1.0 < tolerance)) + check(n_step, 1.0, atol=tolerance) def test_buffer_synchronized_sample_logic(self): """Samples synchronized from the multi-agent buffer.""" diff --git a/rllib/utils/replay_buffers/utils.py b/rllib/utils/replay_buffers/utils.py index 3b1bb6b6924f..c825c64fc3ff 100644 --- a/rllib/utils/replay_buffers/utils.py +++ b/rllib/utils/replay_buffers/utils.py @@ -8,7 +8,7 @@ from ray.rllib.utils.annotations import OldAPIStack from ray.rllib.utils.deprecation import DEPRECATED_VALUE from ray.rllib.utils.from_config import from_config -from ray.rllib.utils.metrics import ALL_MODULES +from ray.rllib.utils.metrics import ALL_MODULES, TD_ERROR_KEY from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY from ray.rllib.utils.replay_buffers import ( EpisodeReplayBuffer, @@ -30,9 +30,6 @@ logger = logging.getLogger(__name__) -# TODO (simon): Move all regular keys to the metric constants file. -TD_ERROR_KEY = "td_error" - @DeveloperAPI def update_priorities_in_episode_replay_buffer( From 5cb7c09af5dc5e6a225526b020ee4b010d5b2748 Mon Sep 17 00:00:00 2001 From: simonsays1980 Date: Fri, 24 May 2024 16:13:23 +0200 Subject: [PATCH 10/65] [RLlib] Cleanup examples folder: Add example restoring 1 of n agents from a checkpoint. 
(#45462) --- rllib/BUILD | 18 +- .../restore_1_of_n_agents_from_checkpoint.py | 257 +++++++++--------- 2 files changed, 144 insertions(+), 131 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 5af156d68c77..097ad26ca80c 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2157,15 +2157,6 @@ py_test( srcs = ["examples/checkpoints/onnx_torch.py"], ) -#@OldAPIStack -py_test( - name = "examples/checkpoints/restore_1_of_n_agents_from_checkpoint", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py"], - args = ["--pre-training-iters=1", "--stop-iters=1", "--num-cpus=4"] -) - # subdirectory: connectors/ # .................................... # Framestacking examples only run in smoke-test mode (a few iters only). @@ -2751,6 +2742,15 @@ py_test( # args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--framework=torch", "--stop-reward=-100.0", "--num-cpus=4"], # ) +py_test( + name = "examples/checkpoints/restore_1_of_n_agents_from_checkpoint", + main = "examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py", + tags = ["team:rllib", "exclusive", "examples", "examples_use_all_core", "no_main"], + size = "large", + srcs = ["examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--framework=torch", "--checkpoint-freq=20", "--checkpoint-at-end", "--num-cpus=4", "--algo=PPO"] +) + py_test( name = "examples/multi_agent/rock_paper_scissors_heuristic_vs_learned", main = "examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py", diff --git a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py index 9c4dc3805613..4338791c71fa 100644 --- a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py +++ b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py @@ -1,140 +1,153 @@ -# TODO (sven): 
Move this example script into the new API stack. - -"""Simple example of how to restore only one of n agents from a trained -multi-agent Algorithm using Ray tune. - -Control the number of agents and policies via --num-agents and --num-policies. +"""An example script showing how to load module weights for 1 of n agents +from checkpoint. + +This example: + - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies. + - Saves a checkpoint of the `MultiAgentRLModule` used every `--checkpoint-freq` + iterations. + - Stops the experiments after the agents reach a combined return of `-800`. + - Picks the best checkpoint by combined return and restores policy 0 from it. + - Runs a second experiment with the restored `RLModule` for policy 0 and + a fresh `RLModule` for the other policies. + - Stops the second experiment after the agents reach a combined return of `-800`. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2 +--checkpoint-freq=20 --checkpoint-at-end` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +Control the number of checkpoints by setting `--checkpoint-freq` to a value > 0. +Note that the checkpoint frequency is per iteration and this example needs at +least a single checkpoint to load the RLModule weights for policy 0. +If `--checkpoint-at-end` is set, a checkpoint will be saved at the end of the +experiment. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +Results to expect +----------------- +You should expect a reward of -400.0 eventually being achieved by a simple +single PPO policy (no tuning, just using RLlib's default settings). In the +second run of the experiment, the MARL module weights for policy 0 are +restored from the checkpoint of the first run. The reward for a single agent +should be -400.0 again, but the training time should be shorter (around 30 +iterations instead of 190). """ -import argparse -import gymnasium as gym import os -import random - -import ray -from ray import air, tune from ray.air.constants import TRAINING_ITERATION -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole -from ray.rllib.policy.policy import Policy -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - NUM_ENV_STEPS_SAMPLED_LIFETIME, +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum +from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED_LIFETIME +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, ) -from ray.rllib.utils.test_utils import check_learning_achieved +from ray.tune.registry import get_trainable_cls, register_env -tf1, tf, tfv = try_import_tf() - -parser = argparse.ArgumentParser() - -parser.add_argument("--num-agents", type=int, default=4) -parser.add_argument("--num-policies", type=int, default=2) -parser.add_argument("--pre-training-iters", type=int, default=5) 
-parser.add_argument("--num-cpus", type=int, default=0) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=200, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=150.0, help="Reward at which we stop training." +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=100000, + default_reward=-400.0, ) +# TODO (sven): This arg is currently ignored (hard-set to 2). +parser.add_argument("--num-policies", type=int, default=2) + if __name__ == "__main__": args = parser.parse_args() - ray.init(num_cpus=args.num_cpus or None) - - # Get obs- and action Spaces. - single_env = gym.make("CartPole-v1") - obs_space = single_env.observation_space - act_space = single_env.action_space - - # Setup PPO with an ensemble of `num_policies` different policies. - policies = { - f"policy_{i}": (None, obs_space, act_space, None) - for i in range(args.num_policies) - } - policy_ids = list(policies.keys()) - - def policy_mapping_fn(agent_id, episode, worker, **kwargs): - pol_id = random.choice(policy_ids) - return pol_id - - config = ( - PPOConfig() - .environment(MultiAgentCartPole, env_config={"num_agents": args.num_agents}) - .framework(args.framework) - .training(num_sgd_iter=10) - .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + # Register our environment with tune. 
+ if args.num_agents > 1: + register_env( + "env", + lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}), + ) + else: + raise ValueError( + f"`num_agents` must be > 1, but is {args.num_agents}." + "Read the script docstring for more information." + ) + + assert args.checkpoint_freq > 0, ( + "This example requires at least one checkpoint to load the RLModule " + "weights for policy 0." ) - # Do some training and store the checkpoint. - results = tune.Tuner( - "PPO", - param_space=config.to_dict(), - run_config=air.RunConfig( - stop={TRAINING_ITERATION: args.pre_training_iters}, - verbose=1, - checkpoint_config=air.CheckpointConfig( - checkpoint_frequency=1, checkpoint_at_end=True - ), - ), - ).fit() - print("Pre-training done.") - - best_checkpoint = results.get_best_result().checkpoint - print(f".. best checkpoint was: {best_checkpoint}") - - policy_0_checkpoint = os.path.join( - best_checkpoint.to_directory(), "policies/policy_0" + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env") + .training( + train_batch_size_per_learner=512, + mini_batch_size_per_learner=64, + lambda_=0.1, + gamma=0.95, + lr=0.0003, + vf_clip_param=10.0, + ) + .rl_module( + model_config_dict={"fcnet_activation": "relu"}, + ) ) - restored_policy_0 = Policy.from_checkpoint(policy_0_checkpoint) - restored_policy_0_weights = restored_policy_0.get_weights() - print("Starting new tune.Tuner().fit()") - # Start our actual experiment. - stop = { - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, - NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, - TRAINING_ITERATION: args.stop_iters, + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Augment the base config with further settings and train the agents. 
+ results = run_rllib_example_script_experiment(base_config, args) + + # Create an env instance to get the observation and action spaces. + env = MultiAgentPendulum(config={"num_agents": args.num_agents}) + # Get the default module spec from the algorithm config. + module_spec = base_config.get_default_rl_module_spec() + module_spec.model_config_dict = base_config.model_config | { + "fcnet_activation": "relu", } - - class RestoreWeightsCallback(DefaultCallbacks): - def on_algorithm_init(self, *, algorithm: "Algorithm", **kwargs) -> None: - algorithm.set_weights({"policy_0": restored_policy_0_weights}) - - # Make sure, the non-1st policies are not updated anymore. - config.policies_to_train = [pid for pid in policy_ids if pid != "policy_0"] - config.callbacks(RestoreWeightsCallback) - - results = tune.run( - "PPO", - stop=stop, - config=config.to_dict(), - verbose=1, + module_spec.observation_space = env.envs[0].observation_space + module_spec.action_space = env.envs[0].action_space + # Create the module for each policy, but policy 0. + module_specs = {} + for i in range(1, args.num_agents or 1): + module_specs[f"p{i}"] = module_spec + + # Now swap in the RLModule weights for policy 0. + chkpt_path = results.get_best_result().checkpoint.path + p_0_module_state_path = os.path.join(chkpt_path, "learner", "module_state", "p0") + module_spec.load_state_path = p_0_module_state_path + module_specs["p0"] = module_spec + + # Create the MARL module. + marl_module_spec = MultiAgentRLModuleSpec(module_specs=module_specs) + # Define the MARL module in the base config. + base_config.rl_module(rl_module_spec=marl_module_spec) + # We need to re-register the environment when starting a new run. + register_env( + "env", + lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}), ) + # Define stopping criteria. + stop = { + # TODO (simon): Change to -800 once the metrics are fixed. Currently + # the combined return is not correctly computed. 
+ f"{ENV_RUNNER_RESULTS}/episode_return_mean": -400, + f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 20000, + TRAINING_ITERATION: 30, + } - if args.as_test: - check_learning_achieved(results, args.stop_reward) - - ray.shutdown() + # Run the experiment again with the restored MARL module. + run_rllib_example_script_experiment(base_config, args, stop=stop) From fde52033983ee529ac6ef3d1265086bd45bd23c9 Mon Sep 17 00:00:00 2001 From: simonsays1980 Date: Fri, 24 May 2024 18:00:47 +0200 Subject: [PATCH 11/65] [RLlib] Example Cleanups new API Stack - Autoregressive Action Module. (#45525) --- .../envs/classes/correlated_actions_env.py | 13 +- .../rl_modules/autoregressive_actions_rlm.py | 107 +++++++ .../classes/autoregressive_actions_rlm.py | 295 ++++++++++++++++++ rllib/models/torch/torch_distributions.py | 9 +- 4 files changed, 416 insertions(+), 8 deletions(-) create mode 100644 rllib/examples/rl_modules/autoregressive_actions_rlm.py create mode 100644 rllib/examples/rl_modules/classes/autoregressive_actions_rlm.py diff --git a/rllib/examples/envs/classes/correlated_actions_env.py b/rllib/examples/envs/classes/correlated_actions_env.py index 055a4d75d558..a3db0d7556d9 100644 --- a/rllib/examples/envs/classes/correlated_actions_env.py +++ b/rllib/examples/envs/classes/correlated_actions_env.py @@ -1,5 +1,6 @@ import gymnasium as gym -from gymnasium.spaces import Discrete, Tuple +from gymnasium.spaces import Box, Discrete, Tuple +import numpy as np import random @@ -13,19 +14,19 @@ class CorrelatedActionsEnv(gym.Env): to a1. I.e., +10 at most per step. One way to effectively learn this is through correlated action - distributions, e.g., in examples/autoregressive_action_dist.py + distributions, e.g., in examples/rl_modules/autoregressive_action_rlm.py There are 20 steps. Hence, the best score would be ~200 reward. 
""" - def __init__(self, _): - self.observation_space = Discrete(2) + def __init__(self, _=None): + self.observation_space = Box(0, 1, shape=(1,), dtype=np.float32) self.action_space = Tuple([Discrete(2), Discrete(2)]) self.last_observation = None def reset(self, *, seed=None, options=None): self.t = 0 - self.last_observation = random.choice([0, 1]) + self.last_observation = np.array([random.choice([0, 1])], dtype=np.float32) return self.last_observation, {} def step(self, action): @@ -39,5 +40,5 @@ def step(self, action): if a1 == a2: reward += 5 done = truncated = self.t > 20 - self.last_observation = random.choice([0, 1]) + self.last_observation = np.array([random.choice([0, 1])], dtype=np.float32) return self.last_observation, reward, done, truncated, {} diff --git a/rllib/examples/rl_modules/autoregressive_actions_rlm.py b/rllib/examples/rl_modules/autoregressive_actions_rlm.py new file mode 100644 index 000000000000..7920ca622738 --- /dev/null +++ b/rllib/examples/rl_modules/autoregressive_actions_rlm.py @@ -0,0 +1,107 @@ +"""An example script showing how to define and load an `RLModule` with +a dependent action space. + +This examples: + - Defines an `RLModule` with autoregressive actions. + - It does so by implementing a prior distribution for the first couple + of actions and then using these actions in a posterior distribution. + - Furthermore, it uses in the `RLModule` our simple base `Catalog` class + to build the distributions. + - Uses this `RLModule` in a PPO training run on a simple environment + that rewards synchronized actions. + - Stops the training after 100k steps or when the mean episode return + exceeds 150 in evaluation, i.e. if the agent has learned to + synchronize its actions. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-env-runners 2` + +Control the number of `EnvRunner`s with the `--num-env-runners` flag. This +will increase the sampling speed. 
+ +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +Results to expect +----------------- +You should expect a reward of around 155-160 after ~36,000 timesteps sampled +(trained) being achieved by a simple PPO policy (no tuning, just using RLlib's +default settings). For details take also a closer look into the +`CorrelatedActionsEnv` environment. Rewards are such that to receive a return +over 100, the agent must learn to synchronize its actions. +""" + + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.models.catalog import Catalog +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.examples.envs.classes.correlated_actions_env import CorrelatedActionsEnv +from ray.rllib.examples.rl_modules.classes.autoregressive_actions_rlm import ( + AutoregressiveActionTorchRLM, +) +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune import register_env + + +register_env("correlated_actions_env", lambda _: CorrelatedActionsEnv(_)) + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=100000, + default_reward=150.0, +) + +if __name__ == "__main__": + args = parser.parse_args() + + if args.algo != "PPO": + raise ValueError("This example only supports PPO. 
Please use --algo=PPO.") + + base_config = ( + PPOConfig() + .environment(env="correlated_actions_env") + .rl_module( + model_config_dict={ + "post_fcnet_hiddens": [64, 64], + "post_fcnet_activation": "relu", + }, + # We need to explicitly specify here RLModule to use and + # the catalog needed to build it. + rl_module_spec=SingleAgentRLModuleSpec( + module_class=AutoregressiveActionTorchRLM, + catalog_class=Catalog, + ), + ) + .evaluation( + evaluation_num_env_runners=1, + evaluation_interval=1, + # Run evaluation parallel to training to speed up the example. + evaluation_parallel_to_training=True, + ) + ) + + # Let's stop the training after 100k steps or when the mean episode return + # exceeds 150 in evaluation. + stop = { + f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 100000, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 150.0, + } + + # Run the example (with Tune). + run_rllib_example_script_experiment(base_config, args, stop=stop) diff --git a/rllib/examples/rl_modules/classes/autoregressive_actions_rlm.py b/rllib/examples/rl_modules/classes/autoregressive_actions_rlm.py new file mode 100644 index 000000000000..fa773944976b --- /dev/null +++ b/rllib/examples/rl_modules/classes/autoregressive_actions_rlm.py @@ -0,0 +1,295 @@ +from abc import abstractmethod +from typing import Any, Dict, Type + +from ray.rllib.core import Columns +from ray.rllib.core.models.base import ENCODER_OUT +from ray.rllib.core.models.configs import MLPHeadConfig +from ray.rllib.core.models.specs.specs_dict import SpecDict +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule +from ray.rllib.models.distributions import Distribution +from ray.rllib.utils.annotations import ( + override, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.torch_utils import convert_to_torch_tensor +from ray.rllib.utils.typing 
import TensorType + +torch, nn = try_import_torch() + + +# TODO (simon): Improvements: `inference-only` mode. +class AutoregressiveActionRLM(RLModule): + """An RLModule that implements an autoregressive action distribution. + + This RLModule implements an autoregressive action distribution, where the + action is sampled in two steps. First, the prior action is sampled from a + prior distribution. Then, the posterior action is sampled from a posterior + distribution that depends on the prior action and the input data. The prior + and posterior distributions are implemented as MLPs. + + The following components are implemented: + - ENCODER: An encoder that processes the observations from the environment. + - PI: A Policy head that outputs the actions, the log probabilities of the + actions, and the input to the action distribution. This head is composed + of two sub-heads: + - A prior head that outputs the logits for the prior action distribution. + - A posterior head that outputs the logits for the posterior action + distribution. + - A value function head that outputs the value function. + + Note, this RLModule is implemented for the `PPO` algorithm only. It is not + guaranteed to work with other algorithms. + """ + + @override(RLModule) + @OverrideToImplementCustomLogic_CallToSuperRecommended + def setup(self): + super().setup() + + # Build the encoder. + self.encoder = self.config.get_catalog().build_encoder(framework=self.framework) + + # Build the prior and posterior heads. + # Note, the action space is a Tuple space. + self.action_dist_cls = self.config.get_catalog().get_action_dist_cls( + self.framework + ) + # Note further, we neet to know the required input dimensions for + # the partial distributions. 
+ self.required_output_dims = self.action_dist_cls.required_input_dim( + space=self.config.action_space, + as_list=True, + ) + action_dims = self.config.action_space[0].shape or (1,) + latent_dims = self.config.get_catalog().latent_dims + prior_config = MLPHeadConfig( + # Use the hidden dimension from the encoder output. + input_dims=latent_dims, + # Use configurations from the `model_config_dict`. + hidden_layer_dims=self.config.model_config_dict["post_fcnet_hiddens"], + hidden_layer_activation=self.config.model_config_dict[ + "post_fcnet_activation" + ], + output_layer_dim=self.required_output_dims[0], + output_layer_activation="linear", + ) + # Build the posterior head. + posterior_config = MLPHeadConfig( + input_dims=(latent_dims[0] + action_dims[0],), + hidden_layer_dims=self.config.model_config_dict["post_fcnet_hiddens"], + hidden_layer_activation=self.config.model_config_dict[ + "post_fcnet_activation" + ], + output_layer_dim=self.required_output_dims[1], + output_layer_activation="linear", + ) + + self.prior = prior_config.build(framework=self.framework) + self.posterior = posterior_config.build(framework=self.framework) + + # Build the value function head. 
+ vf_config = MLPHeadConfig( + input_dims=latent_dims, + hidden_layer_dims=self.config.model_config_dict["post_fcnet_hiddens"], + hidden_layer_activation=self.config.model_config_dict[ + "post_fcnet_activation" + ], + output_layer_dim=1, + output_layer_activation="linear", + ) + self.vf = vf_config.build(framework=self.framework) + + @override(RLModule) + def get_train_action_dist_cls(self) -> Type[Distribution]: + return self.action_dist_cls + + @override(RLModule) + def get_exploration_action_dist_cls(self) -> Type[Distribution]: + return self.action_dist_cls + + @override(RLModule) + def get_inference_action_dist_cls(self) -> Type[Distribution]: + return self.action_dist_cls + + @override(RLModule) + def output_specs_inference(self) -> SpecDict: + return [Columns.ACTIONS] + + @override(RLModule) + def output_specs_exploration(self) -> SpecDict: + return [Columns.ACTION_DIST_INPUTS, Columns.ACTIONS, Columns.ACTION_LOGP] + + @override(RLModule) + def output_specs_train(self) -> SpecDict: + return [ + Columns.ACTION_DIST_INPUTS, + Columns.ACTIONS, + Columns.ACTION_LOGP, + Columns.VF_PREDS, + ] + + @abstractmethod + def pi(self, batch: Dict[str, TensorType]) -> Dict[str, TensorType]: + """Computes the policy outputs given a batch of data. + + Args: + batch: The input batch to pass through the policy head. + + Returns: + A dict mapping Column names to batches of policy outputs. + """ + + @abstractmethod + def _compute_values(self, batch) -> Any: + """Computes values using the vf-specific network(s) and given a batch of data. + + Args: + batch: The input batch to pass through this RLModule (value function + encoder and vf-head). + + Returns: + A dict mapping ModuleIDs to batches of value function outputs (already + squeezed on the last dimension (which should have shape (1,) b/c of the + single value output node). However, for complex multi-agent settings with + shareed value networks, the output might look differently (e.g. 
a single + return batch without the ModuleID-based mapping). + """ + + +class AutoregressiveActionTorchRLM(TorchRLModule, AutoregressiveActionRLM): + @override(AutoregressiveActionRLM) + def pi( + self, batch: Dict[str, TensorType], inference: bool = False + ) -> Dict[str, TensorType]: + pi_outs = {} + + # Prior forward pass. + prior_out = self.prior(batch) + prior_logits = torch.cat( + [ + prior_out, + # We add zeros for the posterior logits, which we do not have at + # this point of time. + torch.zeros(size=(prior_out.shape[0], self.required_output_dims[1])), + ], + dim=-1, + ) + # Get the prior action distribution to sample the prior action. + if inference: + # If in inference mode, we need to set the distribution to be deterministic. + prior_action_dist = self.action_dist_cls.from_logits( + prior_logits + ).to_deterministic() + # If in inference mode, we can sample in a simple way. + prior_action = prior_action_dist._flat_child_distributions[0].sample() + else: + prior_action_dist = self.action_dist_cls.from_logits(prior_logits) + # Note, `TorchMultiDistribution.from_logits` does set the `logits`, but not + # the `probs` attribute. We need to set the `probs` attribute to be able to + # sample from the distribution in a differentiable way. + prior_action_dist._flat_child_distributions[0].probs = torch.softmax( + prior_out, dim=-1 + ) + prior_action_dist._flat_child_distributions[0].logits = None + # Otherwise, we need to be able to backpropagate through the prior action + # that's why we sample from the distribution using the `rsample` method. + # TODO (simon, sven): Check, if we need to return the one-hot sampled action + # instead of the real-valued one. + prior_action = torch.argmax( + prior_action_dist._flat_child_distributions[0].rsample(), + dim=-1, + ) + + # Posterior forward pass. 
+ posterior_batch = torch.cat([batch, prior_action.view(-1, 1)], dim=-1) + posterior_out = self.posterior(posterior_batch) + # Concatenate the prior and posterior logits to get the final logits. + posterior_logits = torch.cat([prior_out, posterior_out], dim=-1) + if inference: + posterior_action_dist = self.action_dist_cls.from_logits( + posterior_logits + ).to_deterministic() + # Sample the posterior action. + posterior_action = posterior_action_dist._flat_child_distributions[ + 1 + ].sample() + + else: + # Get the posterior action distribution to sample the posterior action. + posterior_action_dist = self.action_dist_cls.from_logits(posterior_logits) + # Sample the posterior action. + posterior_action = posterior_action_dist._flat_child_distributions[ + 1 + ].sample() + + # We need the log probabilities of the sampled actions for the loss + # calculation. + prior_action_logp = prior_action_dist._flat_child_distributions[0].logp( + prior_action + ) + posterior_action_logp = posterior_action_dist._flat_child_distributions[ + 1 + ].logp(posterior_action) + pi_outs[Columns.ACTION_LOGP] = prior_action_logp + posterior_action_logp + # We also need the input to the action distribution to calculate the + # KL-divergence. + pi_outs[Columns.ACTION_DIST_INPUTS] = posterior_logits + + # Concatenate the prior and posterior actions and log probabilities. + pi_outs[Columns.ACTIONS] = (prior_action, posterior_action) + + return pi_outs + + @override(TorchRLModule) + def _forward_inference(self, batch: Dict[str, TensorType]) -> Dict[str, TensorType]: + + # Encoder forward pass. + encoder_out = self.encoder(batch) + + # Policy head forward pass. + return self.pi(encoder_out[ENCODER_OUT], inference=True) + + @override(TorchRLModule) + def _forward_exploration( + self, batch: Dict[str, TensorType], **kwargs + ) -> Dict[str, TensorType]: + # Encoder forward pass. + encoder_out = self.encoder(batch) + + # Policy head forward pass. 
+ return self.pi(encoder_out[ENCODER_OUT], inference=False) + + @override(TorchRLModule) + def _forward_train(self, batch: Dict[str, TensorType]) -> Dict[str, TensorType]: + + outs = {} + + # Encoder forward pass. + encoder_out = self.encoder(batch) + + # Policy head forward pass. + outs.update(self.pi(encoder_out[ENCODER_OUT])) + + # Value function head forward pass. + vf_out = self.vf(encoder_out[ENCODER_OUT]) + outs[Columns.VF_PREDS] = vf_out.squeeze(-1) + + return outs + + @override(AutoregressiveActionRLM) + def _compute_values(self, batch, device=None): + infos = batch.pop(Columns.INFOS, None) + batch = convert_to_torch_tensor(batch, device=device) + if infos is not None: + batch[Columns.INFOS] = infos + + # Encoder forward pass. + encoder_outs = self.encoder(batch)[ENCODER_OUT] + + # Value head forward pass. + vf_out = self.vf(encoder_outs) + + # Squeeze out last dimension (single node value head). + return vf_out.squeeze(-1) diff --git a/rllib/models/torch/torch_distributions.py b/rllib/models/torch/torch_distributions.py index d9f42c0ec473..d5f3bccc220c 100644 --- a/rllib/models/torch/torch_distributions.py +++ b/rllib/models/torch/torch_distributions.py @@ -600,8 +600,13 @@ def sample(self): @staticmethod @override(Distribution) - def required_input_dim(space: gym.Space, input_lens: List[int], **kwargs) -> int: - return sum(input_lens) + def required_input_dim( + space: gym.Space, input_lens: List[int], as_list: bool = False, **kwargs + ) -> int: + if as_list: + return input_lens + else: + return sum(input_lens) @classmethod @override(Distribution) From 7709dec241e2ad8eed487bc6afc29e5be87ae387 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Fri, 24 May 2024 13:17:44 -0700 Subject: [PATCH 12/65] [ci][microcheck/11] move get_changed_tests to Test class (#45499) This is part of the series to move some of the function to compute microcheck test coverage to the Test class. 
This will allow me to reuse the logic more easier in other places. Should be a pure refactoring. Test: - CI Signed-off-by: can --- ci/ray_ci/test_tester.py | 30 +----------- ci/ray_ci/tester.py | 61 +---------------------- release/ray_release/test.py | 67 +++++++++++++++++++++++++- release/ray_release/tests/test_test.py | 24 +++++++++ 4 files changed, 93 insertions(+), 89 deletions(-) diff --git a/ci/ray_ci/test_tester.py b/ci/ray_ci/test_tester.py index 46cdf0e178df..0b8819d8b6de 100644 --- a/ci/ray_ci/test_tester.py +++ b/ci/ray_ci/test_tester.py @@ -17,8 +17,6 @@ _get_flaky_test_targets, _get_tag_matcher, _get_new_tests, - _get_changed_files, - _get_changed_tests, _get_human_specified_tests, ) from ray_release.test import Test, TestState @@ -128,7 +126,7 @@ def test_get_test_targets() -> None: "ray_release.test.Test.gen_high_impact_tests", return_value={"step": test_objects}, ), mock.patch( - "ci.ray_ci.tester._get_changed_tests", + "ray_release.test.Test.get_changed_tests", return_value=set(), ), mock.patch( "ci.ray_ci.tester._get_new_tests", @@ -256,7 +254,7 @@ def test_get_high_impact_test_targets() -> None: "ci.ray_ci.tester._get_new_tests", return_value=test["new_tests"], ), mock.patch( - "ci.ray_ci.tester._get_changed_tests", + "ray_release.test.Test.get_changed_tests", return_value=test["changed_tests"], ), mock.patch( "ci.ray_ci.tester._get_human_specified_tests", @@ -285,30 +283,6 @@ def test_get_new_tests(mock_gen_from_s3, mock_run_script_with_output) -> None: ) == {"//new_test"} -@mock.patch.dict( - os.environ, - {"BUILDKITE_PULL_REQUEST_BASE_BRANCH": "base", "BUILDKITE_COMMIT": "commit"}, -) -@mock.patch("subprocess.check_call") -@mock.patch("subprocess.check_output") -def test_get_changed_files(mock_check_output, mock_check_call) -> None: - mock_check_output.return_value = b"file1\nfile2\n" - assert _get_changed_files() == {"file1", "file2"} - - -@mock.patch("ci.ray_ci.tester._get_test_targets_per_file") 
-@mock.patch("ci.ray_ci.tester._get_changed_files") -def test_get_changed_tests( - mock_get_changed_files, mock_get_test_targets_per_file -) -> None: - mock_get_changed_files.return_value = {"test_src", "build_src"} - mock_get_test_targets_per_file.side_effect = ( - lambda x: {"//t1", "//t2"} if x == "test_src" else {} - ) - - assert _get_changed_tests() == {"//t1", "//t2"} - - @mock.patch.dict( os.environ, {"BUILDKITE_PULL_REQUEST_BASE_BRANCH": "base", "BUILDKITE_COMMIT": "commit"}, diff --git a/ci/ray_ci/tester.py b/ci/ray_ci/tester.py index e94117c3f8f8..14f92a4f0918 100644 --- a/ci/ray_ci/tester.py +++ b/ci/ray_ci/tester.py @@ -417,7 +417,7 @@ def _get_high_impact_test_targets( if test.get_oncall() == team } new_tests = _get_new_tests(os_prefix, container) - changed_tests = _get_changed_tests() + changed_tests = Test.get_changed_tests(bazel_workspace_dir) human_specified_tests = _get_human_specified_tests() return ( @@ -464,65 +464,6 @@ def _get_new_tests(prefix: str, container: TesterContainer) -> Set[str]: return local_test_targets.difference(db_test_targets) -def _get_changed_tests() -> Set[str]: - """ - Get all changed tests in the current PR - """ - changed_files = _get_changed_files() - logger.info(f"Changed files: {changed_files}") - return set( - itertools.chain.from_iterable( - [_get_test_targets_per_file(file) for file in _get_changed_files()] - ) - ) - - -def _get_test_targets_per_file(file: str) -> Set[str]: - """ - Get the test target from a file path - """ - try: - package = ( - subprocess.check_output(["bazel", "query", file], cwd=bazel_workspace_dir) - .decode() - .strip() - ) - if not package: - return set() - targets = subprocess.check_output( - ["bazel", "query", f"tests(attr('srcs', {package}, //...))"], - cwd=bazel_workspace_dir, - ) - targets = { - target.strip() - for target in targets.decode().splitlines() - if target is not None - } - logger.info(f"Found test targets for file {file}: {targets}") - - return targets - except 
subprocess.CalledProcessError: - logger.info(f"File {file} is not a test target") - return set() - - -def _get_changed_files() -> Set[str]: - """ - Get all changed files in the current PR - """ - base = os.environ.get("BUILDKITE_PULL_REQUEST_BASE_BRANCH") - head = os.environ.get("BUILDKITE_COMMIT") - if not base or not head: - # if not in a PR, return an empty set - return set() - - changes = subprocess.check_output( - ["git", "diff", "--name-only", f"origin/{base}...{head}"], - cwd=bazel_workspace_dir, - ) - return {file.strip() for file in changes.decode().splitlines() if file is not None} - - def _get_flaky_test_targets( team: str, operating_system: str, yaml_dir: Optional[str] = None ) -> List[str]: diff --git a/release/ray_release/test.py b/release/ray_release/test.py index 868adfac0cb5..b5d5bfef3b9d 100644 --- a/release/ray_release/test.py +++ b/release/ray_release/test.py @@ -3,10 +3,11 @@ import enum import os import platform +import subprocess import json import time from itertools import chain -from typing import Awaitable, Optional, List, Dict +from typing import Awaitable, Optional, List, Dict, Set from dataclasses import dataclass import aioboto3 @@ -211,6 +212,70 @@ def gen_high_impact_tests(cls, prefix: str) -> Dict[str, List]: return step_id_to_tests + @classmethod + def get_changed_tests(cls, bazel_workspace_dir: str) -> Set[str]: + """ + Get all changed tests in the current PR + """ + return set( + chain.from_iterable( + [ + cls._get_test_targets_per_file(file, bazel_workspace_dir) + for file in cls._get_changed_files(bazel_workspace_dir) + ] + ) + ) + + @classmethod + def _get_changed_files(cls, bazel_workspace_dir: str) -> Set[str]: + """ + Get all changed files in the current PR + """ + base = os.environ.get("BUILDKITE_PULL_REQUEST_BASE_BRANCH") + head = os.environ.get("BUILDKITE_COMMIT") + if not base or not head: + # if not in a PR, return an empty set + return set() + + changes = subprocess.check_output( + ["git", "diff", "--name-only", 
f"origin/{base}...{head}"], + cwd=bazel_workspace_dir, + ) + return { + file.strip() for file in changes.decode().splitlines() if file is not None + } + + @classmethod + def _get_test_targets_per_file( + cls, file: str, bazel_workspace_dir: str + ) -> Set[str]: + """ + Get the test target from a file path + """ + try: + package = ( + subprocess.check_output( + ["bazel", "query", file], cwd=bazel_workspace_dir + ) + .decode() + .strip() + ) + if not package: + return set() + targets = subprocess.check_output( + ["bazel", "query", f"tests(attr('srcs', {package}, //...))"], + cwd=bazel_workspace_dir, + ) + targets = { + target.strip() + for target in targets.decode().splitlines() + if target is not None + } + + return targets + except subprocess.CalledProcessError: + return set() + def is_jailed_with_open_issue(self, ray_github: Repository) -> bool: """ Returns whether this test is jailed with open issue. diff --git a/release/ray_release/tests/test_test.py b/release/ray_release/tests/test_test.py index b9a756b5b3ed..cb9265aeabcd 100644 --- a/release/ray_release/tests/test_test.py +++ b/release/ray_release/tests/test_test.py @@ -381,5 +381,29 @@ def test_get_test_target(): assert Test({"name": input}).get_target() == output +@mock.patch.dict( + os.environ, + {"BUILDKITE_PULL_REQUEST_BASE_BRANCH": "base", "BUILDKITE_COMMIT": "commit"}, +) +@mock.patch("subprocess.check_call") +@mock.patch("subprocess.check_output") +def test_get_changed_files(mock_check_output, mock_check_call) -> None: + mock_check_output.return_value = b"file1\nfile2\n" + assert Test._get_changed_files("") == {"file1", "file2"} + + +@mock.patch("ray_release.test.Test._get_test_targets_per_file") +@mock.patch("ray_release.test.Test._get_changed_files") +def test_get_changed_tests( + mock_get_changed_files, mock_get_test_targets_per_file +) -> None: + mock_get_changed_files.return_value = {"test_src", "build_src"} + mock_get_test_targets_per_file.side_effect = ( + lambda x, _: {"//t1", "//t2"} if x == 
"test_src" else {} + ) + + assert Test.get_changed_tests("") == {"//t1", "//t2"} + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) From 10b255e5fb607d34761448099ebd03af24badf4e Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Fri, 24 May 2024 13:49:20 -0700 Subject: [PATCH 13/65] Revert "Unpin setproctitle, fixes #37727" (#45539) Reverts ray-project/ray#40289. Breaking https://github.com/orgs/anyscale/projects/76/views/1?pane=issue&itemId=64249695. Close #45502 Test: - https://buildkite.com/ray-project/postmerge-macos/builds/942 --- ci/env/install-dependencies.sh | 2 +- python/setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/env/install-dependencies.sh b/ci/env/install-dependencies.sh index 784800bcc2c7..67e7d05bb826 100755 --- a/ci/env/install-dependencies.sh +++ b/ci/env/install-dependencies.sh @@ -514,7 +514,7 @@ install_thirdparty_packages() { fi mkdir -p "${WORKSPACE_DIR}/python/ray/thirdparty_files" RAY_THIRDPARTY_FILES="$(realpath "${WORKSPACE_DIR}/python/ray/thirdparty_files")" - CC=gcc python -m pip install psutil==5.9.6 "setproctitle>=1.2.2,<1.4" colorama==0.4.6 --target="${RAY_THIRDPARTY_FILES}" + CC=gcc python -m pip install psutil==5.9.6 setproctitle==1.2.2 colorama==0.4.6 --target="${RAY_THIRDPARTY_FILES}" } install_dependencies() { diff --git a/python/setup.py b/python/setup.py index c402875e0f44..8c1d573f5109 100644 --- a/python/setup.py +++ b/python/setup.py @@ -516,7 +516,7 @@ def build(build_python, build_java, build_cpp): # that certain flags will not be passed along such as --user or sudo. # TODO(rkn): Fix this. 
if not os.getenv("SKIP_THIRDPARTY_INSTALL"): - pip_packages = ["psutil", "setproctitle>=1.2.2,<1.4", "colorama"] + pip_packages = ["psutil", "setproctitle==1.2.2", "colorama"] subprocess.check_call( [ sys.executable, From 21534bbb63ca8d58052337797d8b2f4320807374 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Fri, 24 May 2024 14:57:17 -0700 Subject: [PATCH 14/65] [doc] add a doc page about ci testing workflow on a PR (#45446) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As title, add a doc page about the new ci testing workflow (microcheck, go, premerge, etc.). I'll follow up with another PR to add more images to the doc. Test: - https://anyscale-ray--45446.com.readthedocs.build/en/45446/ray-contribute/ci.html Screenshot 2024-05-20 at 11 02 29 AM --------- Signed-off-by: can Signed-off-by: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Co-authored-by: angelinalg <122562471+angelinalg@users.noreply.github.com> --- doc/source/ray-contribute/ci.rst | 52 +++++++++++++++++++ .../ray-contribute/getting-involved.rst | 1 + 2 files changed, 53 insertions(+) create mode 100644 doc/source/ray-contribute/ci.rst diff --git a/doc/source/ray-contribute/ci.rst b/doc/source/ray-contribute/ci.rst new file mode 100644 index 000000000000..ba6981f934e9 --- /dev/null +++ b/doc/source/ray-contribute/ci.rst @@ -0,0 +1,52 @@ +CI Testing Workflow on PRs +========================== + +This guide helps contributors to understand the Continuous Integration (CI) +workflow on a PR. Here CI stands for the automated testing of the codebase +on the PR. + +`microcheck`: default tests on your PR +-------------------------------------- +With every commit on your PR, by default, we'll run a set of tests +called `microcheck`. + +These tests are designed to be 90% accurate at catching bugs on your +PR while running only 10% of the full test suite. 
As a result, +microcheck typically finishes twice as fast and at half the cost of +the full test suite. Some of the notable features of microcheck are: + +* If a new test is added or an existing test is modified in a pull + request, microcheck will ensure these tests are included. +* You can manually add more tests to microcheck by including the following line + in the body of your git commit message: + `@microcheck TEST_TARGET01 TEST_TARGET02 ....`. This line must be in the + body of your message, starting from the second line or + below (the first line is the commit message title). For example, here + is how I manually add tests in my pull request:: + + // git command to add commit message + git commit -a -s + + // content of the commit message + run other serve doc tests + + @microcheck //doc:source/serve/doc_code/distilbert //doc:source/serve/doc_code/object_detection //doc:source/serve/doc_code/stable_diffusion + + Signed-off-by: can + +If microcheck passes, you'll see a green checkmark on your PR. If it +fails, you'll see a red cross. In either case, you'll see a summary of +the test run statuses in the GitHub UI. + + +Additional tests at merge time +------------------------------ +In this workflow, to merge your PR, simply click on the Enable auto-merge +button (or ask a committer to do so). This will trigger additional test +cases, and the PR will merge automatically once they finish and pass. + +Alternatively, you can also add a `go` label to manually trigger the full +test suite on your PR (be mindful that this is less recommended but we +understand you know best about the need of your PR). While we anticipate +this will be rarely needed, if you do require it constantly, please let +us know. We are continuously improving the effectiveness of microcheck. 
diff --git a/doc/source/ray-contribute/getting-involved.rst b/doc/source/ray-contribute/getting-involved.rst index c936b2dc7819..2d4b76b00a29 100644 --- a/doc/source/ray-contribute/getting-involved.rst +++ b/doc/source/ray-contribute/getting-involved.rst @@ -10,6 +10,7 @@ Getting Involved / Contributing :hidden: development + ci docs writing-code-snippets fake-autoscaler From f9ac0505b41fcaaa38f134129d5bc1e7eee0a4e0 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Fri, 24 May 2024 15:53:55 -0700 Subject: [PATCH 15/65] [ci][microcheck/12] move get_new_tests and get_human_specified_tests to Test class (#45500) This is part of the series to move some of the function to compute microcheck test coverage to the Test class. This will allow me to reuse the logic more easier in other places. Should be a pure refactoring. Test: - CI --------- Signed-off-by: can --- ci/ray_ci/test_tester.py | 14 +------------ ci/ray_ci/tester.py | 28 ++------------------------ release/ray_release/test.py | 25 +++++++++++++++++++++++ release/ray_release/tests/test_test.py | 11 ++++++++++ 4 files changed, 39 insertions(+), 39 deletions(-) diff --git a/ci/ray_ci/test_tester.py b/ci/ray_ci/test_tester.py index 0b8819d8b6de..86bbee6b859f 100644 --- a/ci/ray_ci/test_tester.py +++ b/ci/ray_ci/test_tester.py @@ -17,7 +17,6 @@ _get_flaky_test_targets, _get_tag_matcher, _get_new_tests, - _get_human_specified_tests, ) from ray_release.test import Test, TestState @@ -257,7 +256,7 @@ def test_get_high_impact_test_targets() -> None: "ray_release.test.Test.get_changed_tests", return_value=test["changed_tests"], ), mock.patch( - "ci.ray_ci.tester._get_human_specified_tests", + "ray_release.test.Test.get_human_specified_tests", return_value=test["human_tests"], ): assert ( @@ -283,17 +282,6 @@ def test_get_new_tests(mock_gen_from_s3, mock_run_script_with_output) -> None: ) == {"//new_test"} -@mock.patch.dict( - os.environ, - 
{"BUILDKITE_PULL_REQUEST_BASE_BRANCH": "base", "BUILDKITE_COMMIT": "commit"}, -) -@mock.patch("subprocess.check_call") -@mock.patch("subprocess.check_output") -def test_get_human_specified_tests(mock_check_output, mock_check_call) -> None: - mock_check_output.return_value = b"hi\n@microcheck //test01 //test02\nthere" - assert _get_human_specified_tests() == {"//test01", "//test02"} - - def test_get_flaky_test_targets() -> None: test_harness = [ { diff --git a/ci/ray_ci/tester.py b/ci/ray_ci/tester.py index 14f92a4f0918..ec2967689398 100644 --- a/ci/ray_ci/tester.py +++ b/ci/ray_ci/tester.py @@ -1,6 +1,5 @@ import itertools import os -import subprocess import sys from typing import List, Set, Tuple, Optional @@ -18,7 +17,7 @@ from ci.ray_ci.linux_tester_container import LinuxTesterContainer from ci.ray_ci.windows_tester_container import WindowsTesterContainer from ci.ray_ci.tester_container import TesterContainer -from ci.ray_ci.utils import docker_login, ci_init, logger +from ci.ray_ci.utils import docker_login, ci_init from ray_release.test import Test, TestState CUDA_COPYRIGHT = """ @@ -418,7 +417,7 @@ def _get_high_impact_test_targets( } new_tests = _get_new_tests(os_prefix, container) changed_tests = Test.get_changed_tests(bazel_workspace_dir) - human_specified_tests = _get_human_specified_tests() + human_specified_tests = Test.get_human_specified_tests(bazel_workspace_dir) return ( high_impact_tests.union(new_tests) @@ -427,29 +426,6 @@ def _get_high_impact_test_targets( ) -def _get_human_specified_tests() -> Set[str]: - """ - Get all test targets that are specified by humans - """ - base = os.environ.get("BUILDKITE_PULL_REQUEST_BASE_BRANCH") - head = os.environ.get("BUILDKITE_COMMIT") - if not base or not head: - # if not in a PR, return an empty set - return set() - - tests = set() - messages = subprocess.check_output( - ["git", "rev-list", "--format=%b", f"origin/{base}...{head}"], - cwd=bazel_workspace_dir, - ) - for message in 
messages.decode().splitlines(): - if message.startswith(MICROCHECK_COMMAND): - tests = tests.union(message[len(MICROCHECK_COMMAND) :].strip().split(" ")) - logger.info(f"Human specified tests: {tests}") - - return tests - - def _get_new_tests(prefix: str, container: TesterContainer) -> Set[str]: """ Get all local test targets that are not in database diff --git a/release/ray_release/test.py b/release/ray_release/test.py index b5d5bfef3b9d..fe1df459fd90 100644 --- a/release/ray_release/test.py +++ b/release/ray_release/test.py @@ -28,6 +28,8 @@ get_write_state_machine_aws_bucket, ) +MICROCHECK_COMMAND = "@microcheck" + AWS_TEST_KEY = "ray_tests" AWS_TEST_RESULT_KEY = "ray_test_results" DEFAULT_PYTHON_VERSION = tuple( @@ -212,6 +214,29 @@ def gen_high_impact_tests(cls, prefix: str) -> Dict[str, List]: return step_id_to_tests + @classmethod + def get_human_specified_tests(cls, bazel_workspace_dir: str) -> Set[str]: + """ + Get all test targets that are specified by humans + """ + base = os.environ.get("BUILDKITE_PULL_REQUEST_BASE_BRANCH") + head = os.environ.get("BUILDKITE_COMMIT") + if not base or not head: + # if not in a PR, return an empty set + return set() + + tests = set() + messages = subprocess.check_output( + ["git", "rev-list", "--format=%b", f"origin/{base}...{head}"], + cwd=bazel_workspace_dir, + ) + for message in messages.decode().splitlines(): + if not message.startswith(MICROCHECK_COMMAND): + continue + tests = tests.union(message[len(MICROCHECK_COMMAND) :].strip().split(" ")) + + return tests + @classmethod def get_changed_tests(cls, bazel_workspace_dir: str) -> Set[str]: """ diff --git a/release/ray_release/tests/test_test.py b/release/ray_release/tests/test_test.py index cb9265aeabcd..2513eb1ea401 100644 --- a/release/ray_release/tests/test_test.py +++ b/release/ray_release/tests/test_test.py @@ -405,5 +405,16 @@ def test_get_changed_tests( assert Test.get_changed_tests("") == {"//t1", "//t2"} +@mock.patch.dict( + os.environ, + 
{"BUILDKITE_PULL_REQUEST_BASE_BRANCH": "base", "BUILDKITE_COMMIT": "commit"}, +) +@mock.patch("subprocess.check_call") +@mock.patch("subprocess.check_output") +def test_get_human_specified_tests(mock_check_output, mock_check_call) -> None: + mock_check_output.return_value = b"hi\n@microcheck //test01 //test02\nthere" + assert Test.get_human_specified_tests("") == {"//test01", "//test02"} + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) From 91a6ed2f11f412eac65de78eef61c229f7aa09ec Mon Sep 17 00:00:00 2001 From: Andrew Sy Kim Date: Sat, 25 May 2024 01:04:26 -0400 Subject: [PATCH 16/65] [Docs][KubeRay] add example for distributed checkpointing with kuberay and gcsfuse (#45043) Signed-off-by: Andrew Sy Kim --- doc/source/cluster/kubernetes/examples.md | 2 + .../distributed-checkpointing-with-gcsfuse.md | 296 ++++++++++++++++++ 2 files changed, 298 insertions(+) create mode 100644 doc/source/cluster/kubernetes/examples/distributed-checkpointing-with-gcsfuse.md diff --git a/doc/source/cluster/kubernetes/examples.md b/doc/source/cluster/kubernetes/examples.md index 7a867f4866a3..10fbe2ed8b71 100644 --- a/doc/source/cluster/kubernetes/examples.md +++ b/doc/source/cluster/kubernetes/examples.md @@ -13,6 +13,7 @@ examples/text-summarizer-rayservice examples/rayjob-batch-inference-example examples/rayjob-kueue-priority-scheduling examples/rayjob-kueue-gang-scheduling +examples/distributed-checkpointing-with-gcsfuse ``` @@ -26,3 +27,4 @@ This section presents example Ray workloads to try out on your Kubernetes cluste - {ref}`kuberay-batch-inference-example` - {ref}`kuberay-kueue-priority-scheduling-example` - {ref}`kuberay-kueue-gang-scheduling-example` +- {ref}`kuberay-distributed-checkpointing-gcsefuse` diff --git a/doc/source/cluster/kubernetes/examples/distributed-checkpointing-with-gcsfuse.md b/doc/source/cluster/kubernetes/examples/distributed-checkpointing-with-gcsfuse.md new file mode 100644 index 000000000000..c8c512095fee --- /dev/null +++ 
b/doc/source/cluster/kubernetes/examples/distributed-checkpointing-with-gcsfuse.md @@ -0,0 +1,296 @@ +(kuberay-distributed-checkpointing-gcsefuse)= + +# Distributed checkpointing with KubeRay and GCSFuse + +This example orchestrates distributed checkpointing with KubeRay, using the GCSFuse CSI driver and Google Cloud Storage as the remote storage system. +To illustrate the concepts, this guide uses the [Finetuning a Pytorch Image Classifier with Ray Train](https://docs.ray.io/en/latest/train/examples/pytorch/pytorch_resnet_finetune.html) example. + +## Why distributed checkpointing with GCSFuse? + +In large-scale, high-performance machine learning, distributed checkpointing is crucial for fault tolerance, ensuring that if a node fails during training, Ray can resume the process from the latest saved checkpoint instead of starting from scratch. +While it's possible to directly reference remote storage paths (e.g., `gs://my-checkpoint-bucket`), using Google Cloud Storage FUSE (GCSFuse) has distinct advantages for distributed applications. GCSFuse allows you to mount Cloud Storage buckets like local file systems, making checkpoint management more intuitive for distributed applications that rely on these semantics. Furthermore, GCSFuse is designed for high-performance workloads, delivering the performance and scalability you need for distributed checkpointing of large models. + +[Distributed checkpointing](https://docs.ray.io/en/latest/train/user-guides/checkpoints.html), in combination with [GCSFuse](https://cloud.google.com/storage/docs/gcs-fuse), allows for larger-scale model training with increased availability and efficiency. 
+ +## Create a Kubernetes cluster on GKE + +Create a GKE cluster with the [GCSFuse CSI driver](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver) and [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) enabled, as well as a GPU node pool with 4 L4 GPUs: +``` +export PROJECT_ID= +gcloud container clusters create kuberay-with-gcsfuse \ + --addons GcsFuseCsiDriver \ + --cluster-version=1.29.4 \ + --location=us-east4-c \ + --machine-type=g2-standard-8 \ + --release-channel=rapid \ + --num-nodes=4 \ + --accelerator type=nvidia-l4,count=1,gpu-driver-version=latest \ + --workload-pool=${PROJECT_ID}.svc.id.goog +``` + +Verify the successful creation of your cluster with 4 GPUs: +``` +$ kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" +NAME GPU +gke-kuberay-with-gcsfuse-default-pool-xxxx-0000 1 +gke-kuberay-with-gcsfuse-default-pool-xxxx-1111 1 +gke-kuberay-with-gcsfuse-default-pool-xxxx-2222 1 +gke-kuberay-with-gcsfuse-default-pool-xxxx-3333 1 +``` + +## Install the KubeRay operator + +Follow [Deploy a KubeRay operator](kuberay-operator-deploy) to install the latest stable KubeRay operator from the Helm repository. +The KubeRay operator Pod must be on the CPU node if you set up the taint for the GPU node pool correctly. + +## Configuring the GCS Bucket + +Create a GCS bucket that Ray uses as the remote filesystem. +``` +BUCKET= +gcloud storage buckets create gs://$BUCKET --uniform-bucket-level-access +``` + +Create a Kubernetes ServiceAccount that grants the RayCluster access to mount the GCS bucket: +``` +kubectl create serviceaccount pytorch-distributed-training +``` + +Bind the `roles/storage.objectUser` role to the Kubernetes service account and bucket IAM policy. 
+See [Identifying projects](https://cloud.google.com/resource-manager/docs/creating-managing-projects#identifying_projects) to find your project ID and project number: +``` +PROJECT_ID= +PROJECT_NUMBER= +gcloud storage buckets add-iam-policy-binding gs://${BUCKET} --member "principal://iam.googleapis.com/projects/${PROJECT_NUMBER}/locations/global/workloadIdentityPools/${PROJECT_ID}.svc.id.goog/subject/ns/default/sa/pytorch-distributed-training" --role "roles/storage.objectUser" +``` + +See [Access Cloud Storage buckets with the Cloud Storage FUSE CSI driver](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver) for more details. + +## Deploy the RayJob + +Download the RayJob that executes all the steps documented in [Finetuning a Pytorch Image Classifier with Ray Train](https://docs.ray.io/en/latest/train/examples/pytorch/pytorch_resnet_finetune.html). The [source code](https://github.com/ray-project/kuberay/tree/master/ray-operator/config/samples/pytorch-resnet-image-classifier) is also in the KubeRay repository. + +``` +curl -LO https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/pytorch-resnet-image-classifier/ray-job.pytorch-image-classifier.yaml +``` + +Modify the RayJob by replacing all instances of the `GCS_BUCKET` placeholder with the Google Cloud Storage bucket you created earlier. Alternatively you can use `sed`: +``` +sed -i "s/GCS_BUCKET/$BUCKET/g" ray-job.pytorch-image-classifier.yaml +``` + +Deploy the RayJob: +``` +kubectl create -f ray-job.pytorch-image-classifier.yaml +``` + +The deployed RayJob includes the following configuration to enable distributed checkpointing to a shared filesystem: +* 4 Ray workers, each with a single GPU. +* All Ray nodes use the `pytorch-distributed-training` ServiceAccount, which we created earlier. +* Includes volumes that are managed by the `gcsfuse.csi.storage.gke.io` CSI driver. 
+* Mounts a shared storage path `/mnt/cluster_storage`, backed by the GCS bucket you created earlier. + +You can configure the Pod with annotations, which allows for finer grain control of the GCSFuse sidecar container. See [Specify Pod annotations](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver#pod-annotations) for more details. +``` +annotations: + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: 5Gi + gke-gcsfuse/ephemeral-storage-limit: 10Gi +``` + +You can also specify mount options when defining the GCSFuse container volume: +``` +csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: GCS_BUCKET + mountOptions: "implicit-dirs,uid=1000,gid=100" +``` + +See [Mount options](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver#mount-options) to learn more about mount options. + +Logs from the Ray job should indicate the use of the shared remote filesystem in `/mnt/cluster_storage` and the checkpointing directory. For example: +``` +Training finished iteration 10 at 2024-04-29 10:22:08. Total running time: 1min 30s +╭─────────────────────────────────────────╮ +│ Training result │ +├─────────────────────────────────────────┤ +│ checkpoint_dir_name checkpoint_000009 │ +│ time_this_iter_s 6.47154 │ +│ time_total_s 74.5547 │ +│ training_iteration 10 │ +│ acc 0.24183 │ +│ loss 0.06882 │ +╰─────────────────────────────────────────╯ +Training saved a checkpoint for iteration 10 at: (local)/mnt/cluster_storage/finetune-resnet/TorchTrainer_cbb82_00000_0_2024-04-29_10-20-37/checkpoint_000009 +``` + +## Inspect checkpointing data + +Once the RayJob completes, you can inspect the contents of your bucket using a tool like [gsutil](https://cloud.google.com/storage/docs/gsutil). 
+``` +gsutil ls gs://my-ray-bucket/** +gs://my-ray-bucket/finetune-resnet/ +gs://my-ray-bucket/finetune-resnet/.validate_storage_marker +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/ +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/checkpoint_000007/ +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/checkpoint_000007/checkpoint.pt +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/checkpoint_000008/ +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/checkpoint_000008/checkpoint.pt +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/checkpoint_000009/ +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/checkpoint_000009/checkpoint.pt +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/error.pkl +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/error.txt +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/events.out.tfevents.1714436502.orch-image-classifier-nc2sq-raycluster-tdrfx-head-xzcl8 +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/events.out.tfevents.1714436809.orch-image-classifier-zz4sj-raycluster-vn7kz-head-lwx8k +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/params.json +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/params.pkl +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/progress.csv +gs://my-ray-bucket/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/result.json +gs://my-ray-bucket/finetune-resnet/basic-variant-state-2024-04-29_17-21-29.json +gs://my-ray-bucket/finetune-resnet/basic-variant-state-2024-04-29_17-26-35.json 
+gs://my-ray-bucket/finetune-resnet/experiment_state-2024-04-29_17-21-29.json +gs://my-ray-bucket/finetune-resnet/experiment_state-2024-04-29_17-26-35.json +gs://my-ray-bucket/finetune-resnet/trainer.pkl +gs://my-ray-bucket/finetune-resnet/tuner.pkl + +``` + +## Resuming from checkpoint + +In the event of a failed job, you can use the latest checkpoint to resume training of the model. This example configures `TorchTrainer` to automatically resume +from the latest checkpoint: +```python +experiment_path = os.path.expanduser("/mnt/cluster_storage/finetune-resnet") +if TorchTrainer.can_restore(experiment_path): + trainer = TorchTrainer.restore(experiment_path, + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + scaling_config=scaling_config, + run_config=run_config, + ) +else: + trainer = TorchTrainer( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + scaling_config=scaling_config, + run_config=run_config, + ) +``` + +You can verify automatic checkpoint recovery by redeploying the same RayJob: +``` +kubectl create -f ray-job.pytorch-image-classifier.yaml +``` + +If the previous job succeeded, the training job should restore the checkpoint state from the `checkpoint_000009` directory +and then immediately complete training with 0 iterations: +``` +2024-04-29 15:51:32,528 INFO experiment_state.py:366 -- Trying to find and download experiment checkpoint at /mnt/cluster_storage/finetune-resnet +2024-04-29 15:51:32,651 INFO experiment_state.py:396 -- A remote experiment checkpoint was found and will be used to restore the previous experiment state. 
+2024-04-29 15:51:32,652 INFO tune_controller.py:404 -- Using the newest experiment state file found within the experiment directory: experiment_state-2024-04-29_15-43-40.json + +View detailed results here: /mnt/cluster_storage/finetune-resnet +To visualize your results with TensorBoard, run: `tensorboard --logdir /home/ray/ray_results/finetune-resnet` + +Result( + metrics={'loss': 0.070047477101968, 'acc': 0.23529411764705882}, + path='/mnt/cluster_storage/finetune-resnet/TorchTrainer_ecc04_00000_0_2024-04-29_15-43-40', + filesystem='local', + checkpoint=Checkpoint(filesystem=local, path=/mnt/cluster_storage/finetune-resnet/TorchTrainer_ecc04_00000_0_2024-04-29_15-43-40/checkpoint_000009) +) +``` + +If the previous job failed at an earlier checkpoint, the job should resume from the last saved checkpoint and run until `max_epochs=10`. For example, if the last run +failed at epoch 7, the training automatically resumes using `checkpoint_000006` and runs 3 more iterations until epoch 10: +``` +(TorchTrainer pid=611, ip=10.108.2.65) Restored on 10.108.2.65 from checkpoint: Checkpoint(filesystem=local, path=/mnt/cluster_storage/finetune-resnet/TorchTrainer_96923_00000_0_2024-04-29_17-21-29/checkpoint_000006) +(RayTrainWorker pid=671, ip=10.108.2.65) Setting up process group for: env:// [rank=0, world_size=4] +(TorchTrainer pid=611, ip=10.108.2.65) Started distributed worker processes: +(TorchTrainer pid=611, ip=10.108.2.65) - (ip=10.108.2.65, pid=671) world_rank=0, local_rank=0, node_rank=0 +(TorchTrainer pid=611, ip=10.108.2.65) - (ip=10.108.1.83, pid=589) world_rank=1, local_rank=0, node_rank=1 +(TorchTrainer pid=611, ip=10.108.2.65) - (ip=10.108.0.72, pid=590) world_rank=2, local_rank=0, node_rank=2 +(TorchTrainer pid=611, ip=10.108.2.65) - (ip=10.108.3.76, pid=590) world_rank=3, local_rank=0, node_rank=3 +(RayTrainWorker pid=589, ip=10.108.1.83) Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to 
/home/ray/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth +(RayTrainWorker pid=671, ip=10.108.2.65) + 0%| | 0.00/97.8M [00:00 Date: Sat, 25 May 2024 00:42:33 -0700 Subject: [PATCH 17/65] [Data] Add `override_num_blocks` to `from_pandas` and perform auto-partition (#44937) A common pattern is to load a DataFrame containing file URIs with from_pandas and then loading those URIs with map_batches. If you have a single large DataFrame, the subsequent operator (e.g., for reading) won't be parallelized because from_pandas produces one input block. To fix this issue, this PR automatically splits DataFrames into a good number of blocks, and allows the user to override the number of blocks. Signed-off-by: Balaji Veeramani --- python/ray/data/__init__.py | 1 + .../logical/operators/from_operators.py | 6 ++ python/ray/data/_internal/pandas_block.py | 34 ++++++++++- python/ray/data/_internal/util.py | 4 +- python/ray/data/read_api.py | 59 ++++++++++++++++++- python/ray/data/tests/test_consumption.py | 25 +++++++- python/ray/data/tests/test_csv.py | 4 +- python/ray/data/tests/test_ecosystem.py | 6 +- python/ray/data/tests/test_json.py | 4 +- python/ray/data/tests/test_map.py | 4 +- python/ray/data/tests/test_pandas.py | 27 ++++++++- python/ray/data/tests/test_parquet.py | 4 +- python/ray/data/tests/test_sort.py | 4 +- 13 files changed, 159 insertions(+), 23 deletions(-) diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index 20d2a9d2c263..901a282775c5 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -26,6 +26,7 @@ from ray.data.read_api import ( # noqa: F401 from_arrow, from_arrow_refs, + from_blocks, from_dask, from_huggingface, from_items, diff --git a/python/ray/data/_internal/logical/operators/from_operators.py b/python/ray/data/_internal/logical/operators/from_operators.py index 057c07f9f7f6..146a8b7eac15 100644 --- a/python/ray/data/_internal/logical/operators/from_operators.py +++ 
b/python/ray/data/_internal/logical/operators/from_operators.py @@ -42,6 +42,12 @@ class FromItems(AbstractFrom): pass +class FromBlocks(AbstractFrom): + """Logical operator for `from_blocks`.""" + + pass + + class FromNumpy(AbstractFrom): """Logical operator for `from_numpy`.""" diff --git a/python/ray/data/_internal/pandas_block.py b/python/ray/data/_internal/pandas_block.py index 974c9d3733eb..599ebff222c9 100644 --- a/python/ray/data/_internal/pandas_block.py +++ b/python/ray/data/_internal/pandas_block.py @@ -271,7 +271,9 @@ def to_numpy( def to_arrow(self) -> "pyarrow.Table": import pyarrow - return pyarrow.table(self._table) + # Set `preserve_index=False` so that Arrow doesn't add a '__index_level_0__' + # column to the resulting table. + return pyarrow.Table.from_pandas(self._table, preserve_index=False) @staticmethod def numpy_to_block( @@ -632,3 +634,33 @@ def gen(): def block_type(self) -> BlockType: return BlockType.PANDAS + + +def _estimate_dataframe_size(df: "pandas.DataFrame") -> int: + """Estimate the size of a pandas DataFrame. + + This function is necessary because `DataFrame.memory_usage` doesn't count values in + columns with `dtype=object`. + + The runtime complexity is linear in the number of values, so don't use this in + performance-critical code. + + Args: + df: The DataFrame to estimate the size of. + + Returns: + The estimated size of the DataFrame in bytes. + """ + size = 0 + for column in df.columns: + if df[column].dtype == object: + for item in df[column]: + if isinstance(item, str): + size += len(item) + elif isinstance(item, np.ndarray): + size += item.nbytes + else: + size += 8 # pandas assumes object values are 8 bytes. 
+ else: + size += df[column].nbytes + return size diff --git a/python/ray/data/_internal/util.py b/python/ray/data/_internal/util.py index 954be73b508e..c91f34f1214b 100644 --- a/python/ray/data/_internal/util.py +++ b/python/ray/data/_internal/util.py @@ -649,10 +649,8 @@ def capitalize(s: str): def pandas_df_to_arrow_block(df: "pandas.DataFrame") -> "Block": from ray.data.block import BlockAccessor, BlockExecStats + block = BlockAccessor.for_block(df).to_arrow() stats = BlockExecStats.builder() - import pyarrow as pa - - block = pa.table(df) return ( block, BlockAccessor.for_block(block).get_metadata( diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index aeb6240015b9..d30045cb3085 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -1,5 +1,6 @@ import collections import logging +import math import os import warnings from typing import ( @@ -25,12 +26,14 @@ from ray.data._internal.lazy_block_list import LazyBlockList from ray.data._internal.logical.operators.from_operators import ( FromArrow, + FromBlocks, FromItems, FromNumpy, FromPandas, ) from ray.data._internal.logical.operators.read_operator import Read from ray.data._internal.logical.optimizers import LogicalPlan +from ray.data._internal.pandas_block import _estimate_dataframe_size from ray.data._internal.plan import ExecutionPlan from ray.data._internal.remote_fn import cached_remote_fn from ray.data._internal.stats import DatasetStats @@ -104,6 +107,37 @@ logger = logging.getLogger(__name__) +@DeveloperAPI +def from_blocks(blocks: List[Block]): + """Create a :class:`~ray.data.Dataset` from a list of blocks. + + This method is primarily used for testing. Unlike other methods like + :func:`~ray.data.from_pandas` and :func:`~ray.data.from_arrow`, this method + guarantees that it won't modify the number of blocks. + + Args: + blocks: List of blocks to create the dataset from. + + Returns: + A :class:`~ray.data.Dataset` holding the blocks. 
+ """ + block_refs = [ray.put(block) for block in blocks] + metadata = [ + BlockAccessor.for_block(block).get_metadata(input_files=None, exec_stats=None) + for block in blocks + ] + from_blocks_op = FromBlocks(block_refs, metadata) + logical_plan = LogicalPlan(from_blocks_op) + return MaterializedDataset( + ExecutionPlan( + BlockList(block_refs, metadata, owned_by_consumer=False), + DatasetStats(metadata={"FromBlocks": metadata}, parent=None), + run_by_consumer=False, + ), + logical_plan, + ) + + @PublicAPI def from_items( items: List[Any], @@ -2359,7 +2393,8 @@ def from_modin(df: "modin.pandas.dataframe.DataFrame") -> MaterializedDataset: @PublicAPI def from_pandas( - dfs: Union["pandas.DataFrame", List["pandas.DataFrame"]] + dfs: Union["pandas.DataFrame", List["pandas.DataFrame"]], + override_num_blocks: Optional[int] = None, ) -> MaterializedDataset: """Create a :class:`~ray.data.Dataset` from a list of pandas dataframes. @@ -2373,10 +2408,14 @@ def from_pandas( Create a Ray Dataset from a list of Pandas DataFrames. >>> ray.data.from_pandas([df, df]) - MaterializedDataset(num_blocks=2, num_rows=6, schema={a: int64, b: int64}) + MaterializedDataset(num_blocks=1, num_rows=6, schema={a: int64, b: int64}) Args: dfs: A pandas dataframe or a list of pandas dataframes. + override_num_blocks: Override the number of output blocks from all read tasks. + By default, the number of output blocks is dynamically decided based on + input data size and available resources. You shouldn't manually set this + value in most cases. Returns: :class:`~ray.data.Dataset` holding data read from the dataframes. 
@@ -2386,13 +2425,27 @@ def from_pandas( if isinstance(dfs, pd.DataFrame): dfs = [dfs] + context = DataContext.get_current() + num_blocks = override_num_blocks + if num_blocks is None: + total_size = sum(_estimate_dataframe_size(df) for df in dfs) + num_blocks = max(math.ceil(total_size / context.target_max_block_size), 1) + + if len(dfs) > 1: + # I assume most users pass a single DataFrame as input. For simplicity, I'm + # concatenating DataFrames, even though it's not efficient. + ary = pd.concat(dfs, axis=0) + else: + ary = dfs[0] + dfs = np.array_split(ary, num_blocks) + from ray.air.util.data_batch_conversion import ( _cast_ndarray_columns_to_tensor_extension, ) - context = DataContext.get_current() if context.enable_tensor_extension_casting: dfs = [_cast_ndarray_columns_to_tensor_extension(df.copy()) for df in dfs] + return from_pandas_refs([ray.put(df) for df in dfs]) diff --git a/python/ray/data/tests/test_consumption.py b/python/ray/data/tests/test_consumption.py index 5581e0265017..5151d9b65e4b 100644 --- a/python/ray/data/tests/test_consumption.py +++ b/python/ray/data/tests/test_consumption.py @@ -679,6 +679,27 @@ def test_convert_types(ray_start_regular_shared): assert arrow_ds.map(lambda x: {"a": (x["id"],)}).take() == [{"a": [0]}] +@pytest.mark.parametrize( + "input_blocks", + [ + [pd.DataFrame({"column": ["spam"]}), pd.DataFrame({"column": ["ham", "eggs"]})], + [ + pa.Table.from_pydict({"column": ["spam"]}), + pa.Table.from_pydict({"column": ["ham", "eggs"]}), + ], + ], +) +def test_from_blocks(input_blocks, ray_start_regular_shared): + ds = ray.data.from_blocks(input_blocks) + + output_blocks = [ray.get(block_ref) for block_ref in ds.get_internal_block_refs()] + assert len(input_blocks) == len(output_blocks) + assert all( + input_block.equals(output_block) + for input_block, output_block in zip(input_blocks, output_blocks) + ) + + def test_from_items(ray_start_regular_shared): ds = ray.data.from_items(["hello", "world"]) assert 
extract_values("item", ds.take()) == ["hello", "world"] @@ -781,7 +802,7 @@ def test_iter_batches_basic(ray_start_regular_shared): df3 = pd.DataFrame({"one": [7, 8, 9], "two": [8, 9, 10]}) df4 = pd.DataFrame({"one": [10, 11, 12], "two": [11, 12, 13]}) dfs = [df1, df2, df3, df4] - ds = ray.data.from_pandas(dfs) + ds = ray.data.from_blocks(dfs) # Default. for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="pandas"), dfs): @@ -1179,7 +1200,7 @@ def test_iter_batches_grid(ray_start_regular_shared): ) running_size += block_size num_rows = running_size - ds = ray.data.from_pandas(dfs) + ds = ray.data.from_blocks(dfs) for batch_size in np.random.randint( 1, num_rows + 1, size=batch_size_samples ): diff --git a/python/ray/data/tests/test_csv.py b/python/ray/data/tests/test_csv.py index 5d207f8d4b18..be8bfac48312 100644 --- a/python/ray/data/tests/test_csv.py +++ b/python/ray/data/tests/test_csv.py @@ -689,7 +689,7 @@ def test_csv_write(ray_start_regular_shared, fs, data_path, endpoint_url): storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) # Single block. df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - ds = ray.data.from_pandas([df1]) + ds = ray.data.from_blocks([df1]) ds._set_uuid("data") ds.write_csv(data_path, filesystem=fs) file_path = os.path.join(data_path, "data_000000_000000.csv") @@ -697,7 +697,7 @@ def test_csv_write(ray_start_regular_shared, fs, data_path, endpoint_url): # Two blocks. 
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) - ds = ray.data.from_pandas([df1, df2]) + ds = ray.data.from_blocks([df1, df2]) ds._set_uuid("data") ds.write_csv(data_path, filesystem=fs) file_path2 = os.path.join(data_path, "data_000001_000000.csv") diff --git a/python/ray/data/tests/test_ecosystem.py b/python/ray/data/tests/test_ecosystem.py index 17df126ccee6..4d124907ff44 100644 --- a/python/ray/data/tests/test_ecosystem.py +++ b/python/ray/data/tests/test_ecosystem.py @@ -33,7 +33,7 @@ def test_to_dask(ray_start_regular_shared, ds_format): df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) df = pd.concat([df1, df2]) - ds = ray.data.from_pandas([df1, df2]) + ds = ray.data.from_blocks([df1, df2]) if ds_format == "arrow": ds = ds.map_batches(lambda df: df, batch_format="pyarrow", batch_size=None) ddf = ds.to_dask() @@ -52,7 +52,7 @@ def test_to_dask(ray_start_regular_shared, ds_format): df1["two"] = df1["two"].astype(pd.StringDtype()) df2["two"] = df2["two"].astype(pd.StringDtype()) df = pd.concat([df1, df2]) - ds = ray.data.from_pandas([df1, df2]) + ds = ray.data.from_blocks([df1, df2]) if ds_format == "arrow": ds = ds.map_batches(lambda df: df, batch_format="pyarrow", batch_size=None) ddf = ds.to_dask( @@ -76,7 +76,7 @@ def test_to_dask(ray_start_regular_shared, ds_format): df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) df2 = pd.DataFrame({"three": [4, 5, 6], "four": ["e", "f", "g"]}) df = pd.concat([df1, df2]) - ds = ray.data.from_pandas([df1, df2]) + ds = ray.data.from_blocks([df1, df2]) if ds_format == "arrow": ds = ds.map_batches(lambda df: df, batch_format="pyarrow", batch_size=None) ddf = ds.to_dask(verify_meta=False) diff --git a/python/ray/data/tests/test_json.py b/python/ray/data/tests/test_json.py index 1a90069fc7bc..789fb9e3b89c 100644 --- a/python/ray/data/tests/test_json.py +++ b/python/ray/data/tests/test_json.py @@ -488,7 +488,7 @@ def 
test_json_write(ray_start_regular_shared, fs, data_path, endpoint_url): storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) # Single block. df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - ds = ray.data.from_pandas([df1]) + ds = ray.data.from_blocks([df1]) ds._set_uuid("data") ds.write_json(data_path, filesystem=fs) file_path = os.path.join(data_path, "data_000000_000000.json") @@ -500,7 +500,7 @@ def test_json_write(ray_start_regular_shared, fs, data_path, endpoint_url): # Two blocks. df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) - ds = ray.data.from_pandas([df1, df2]) + ds = ray.data.from_blocks([df1, df2]) ds._set_uuid("data") ds.write_json(data_path, filesystem=fs) file_path2 = os.path.join(data_path, "data_000001_000000.json") diff --git a/python/ray/data/tests/test_map.py b/python/ray/data/tests/test_map.py index 38a6aacdd8a5..a511eb7255c3 100644 --- a/python/ray/data/tests/test_map.py +++ b/python/ray/data/tests/test_map.py @@ -830,7 +830,7 @@ def test_map_batches_block_bundling_skewed_manual( ray_start_regular_shared, block_sizes, batch_size, expected_num_blocks ): num_blocks = len(block_sizes) - ds = ray.data.from_pandas( + ds = ray.data.from_blocks( [pd.DataFrame({"a": [1] * block_size}) for block_size in block_sizes] ) # Confirm that we have the expected number of initial blocks. @@ -856,7 +856,7 @@ def test_map_batches_block_bundling_skewed_auto( ray_start_regular_shared, block_sizes, batch_size ): num_blocks = len(block_sizes) - ds = ray.data.from_pandas( + ds = ray.data.from_blocks( [pd.DataFrame({"a": [1] * block_size}) for block_size in block_sizes] ) # Confirm that we have the expected number of initial blocks. 
diff --git a/python/ray/data/tests/test_pandas.py b/python/ray/data/tests/test_pandas.py index ec27c296a296..9bb157789ef4 100644 --- a/python/ray/data/tests/test_pandas.py +++ b/python/ray/data/tests/test_pandas.py @@ -48,6 +48,31 @@ def test_from_pandas(ray_start_regular_shared, enable_pandas_block): ctx.enable_pandas_block = old_enable_pandas_block +def test_from_pandas_default_num_blocks(ray_start_regular_shared, restore_data_context): + ray.data.DataContext.get_current().target_max_block_size = 8 * 1024 * 1024 # 8 MiB + + record = {"number": 0, "string": "\0"} + record_size_bytes = 8 + 1 # 8 bytes for int64 and 1 byte for char + dataframe_size_bytes = 64 * 1024 * 1024 # 64 MiB + num_records = int(dataframe_size_bytes / record_size_bytes) + df = pd.DataFrame.from_records([record] * num_records) + + ds = ray.data.from_pandas(df) + + # If the target block size is 8 MiB, the DataFrame should be split into + # 64 MiB / (8 MiB / block) = 8 blocks. + assert ds.materialize().num_blocks() == 8 + + +@pytest.mark.parametrize("num_inputs", [1, 2]) +def test_from_pandas_override_num_blocks(num_inputs, ray_start_regular_shared): + df = pd.DataFrame({"number": [0]}) + + ds = ray.data.from_pandas([df] * num_inputs, override_num_blocks=2) + + assert ds.materialize().num_blocks() == 2 + + @pytest.mark.parametrize("enable_pandas_block", [False, True]) def test_from_pandas_refs(ray_start_regular_shared, enable_pandas_block): ctx = ray.data.context.DataContext.get_current() @@ -113,7 +138,7 @@ def test_to_pandas_refs(ray_start_regular_shared): def test_pandas_roundtrip(ray_start_regular_shared, tmp_path): df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) - ds = ray.data.from_pandas([df1, df2]) + ds = ray.data.from_pandas([df1, df2], override_num_blocks=2) dfds = ds.to_pandas() assert pd.concat([df1, df2], ignore_index=True).equals(dfds) diff --git a/python/ray/data/tests/test_parquet.py 
b/python/ray/data/tests/test_parquet.py index 1e8c5c931a19..d9ecfdb0e09f 100644 --- a/python/ray/data/tests/test_parquet.py +++ b/python/ray/data/tests/test_parquet.py @@ -875,7 +875,7 @@ def test_parquet_write(ray_start_regular_shared, fs, data_path, endpoint_url): df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) df = pd.concat([df1, df2]) - ds = ray.data.from_pandas([df1, df2]) + ds = ray.data.from_blocks([df1, df2]) path = os.path.join(data_path, "test_parquet_dir") if fs is None: os.mkdir(path) @@ -928,7 +928,7 @@ def test_parquet_write_create_dir( df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) df = pd.concat([df1, df2]) - ds = ray.data.from_pandas([df1, df2]) + ds = ray.data.from_blocks([df1, df2]) path = os.path.join(data_path, "test_parquet_dir") # Set the uuid to a known value so that we can easily get the parquet file names. 
data_key = "data" diff --git a/python/ray/data/tests/test_sort.py b/python/ray/data/tests/test_sort.py index a0d0eb1f5c67..d66bf8f0d7a0 100644 --- a/python/ray/data/tests/test_sort.py +++ b/python/ray/data/tests/test_sort.py @@ -119,7 +119,7 @@ def test_sort_arrow( offset += shard if offset < num_items: dfs.append(pd.DataFrame({"a": a[offset:], "b": b[offset:]})) - ds = ray.data.from_pandas(dfs).map_batches( + ds = ray.data.from_blocks(dfs).map_batches( lambda t: t, batch_format="pyarrow", batch_size=None ) @@ -235,7 +235,7 @@ def test_sort_pandas(ray_start_regular, num_items, parallelism, use_push_based_s offset += shard if offset < num_items: dfs.append(pd.DataFrame({"a": a[offset:], "b": b[offset:]})) - ds = ray.data.from_pandas(dfs) + ds = ray.data.from_blocks(dfs) def assert_sorted(sorted_ds, expected_rows): assert [tuple(row.values()) for row in sorted_ds.iter_rows()] == list( From 93dce13cbcd6871afd0ed61bd46cd989945667f3 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Sat, 25 May 2024 20:56:50 -0700 Subject: [PATCH 18/65] [core][experimental] Support NCCL-based torch.Tensors nested inside Python objects (#45473) Allows torch.Tensors nested inside Python objects to be transferred via NCCL using the following syntax: ```python with InputNode() as inp: dag = sender.send.bind(inp) dag = dag.with_type_hint(TorchTensorType(transport="nccl")) dag = receiver.recv.bind(dag) ``` We implement this by using an additional shared memory channel to pass CPU data, with a "nested" NCCL channel to pass the GPU data. Here is the send procedure for the above code: 1. Serialize the data. Extract out all tensors that are on the GPU and replace them with some kind of placeholder. 2. Send a list of metadata through the meta_channel. 3. Send the GPU tensors through the NCCL channel. 4. Send the rest of the CPU data through a cpu_data_channel, with the placeholders for the GPU tensors. 
Note that if the TorchTensorType doesn't have a shape and dtype specified, we currently use the separate meta_channel to pass metadata for the serialized tensors, as introduced in #45332. To elide the cpu_data_channel, the user should now use `TorchTensorType(direct_return=True)`, to indicate that no CPU data is sent along with the GPU data. To elide the meta_channel, the user should declare the shape and dtype, e.g., `TorchTensorType(shape=(10, ), dtype=torch.float16)`. ## Related issue number Closes #45306. --------- Signed-off-by: Stephanie Wang Co-authored-by: SangBin Cho --- python/ray/dag/compiled_dag_node.py | 15 +- python/ray/dag/dag_node.py | 11 +- .../experimental/test_torch_tensor_dag.py | 189 +++++++++++++++- python/ray/experimental/channel/common.py | 44 +++- .../channel/serialization_context.py | 37 ++- .../channel/shared_memory_channel.py | 56 ++++- .../channel/torch_tensor_nccl_channel.py | 213 ++++++++++++++++-- .../experimental/channel/torch_tensor_type.py | 36 ++- 8 files changed, 549 insertions(+), 52 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index c40e1bf8471b..e0cb598d9444 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -115,8 +115,8 @@ def _prep_task(self, task: "ExecutableTask") -> None: else: task.resolved_inputs.append(inp) - for type_hint in task.input_type_hints: - type_hint.register_custom_serializer() + for typ_hint in task.input_type_hints: + typ_hint.register_custom_serializer() task.output_type_hint.register_custom_serializer() input_reader: ReaderInterface = SynchronousReader(task.input_channels) @@ -766,13 +766,16 @@ def teardown(self, wait: bool): for actor in outer.actor_refs: logger.info(f"Cancelling compiled worker on actor: {actor}") - for actor, tasks in outer.actor_to_executable_tasks.items(): + # Cancel all actor loops in parallel. 
+ cancel_refs = [ + actor.__ray_call__.remote(do_cancel_executable_tasks, tasks) + for actor, tasks in outer.actor_to_executable_tasks.items() + ] + for cancel_ref in cancel_refs: try: # TODO(swang): Suppress exceptions from actors trying to # read closed channels when DAG is being torn down. - ray.get( - actor.__ray_call__.remote(do_cancel_executable_tasks, tasks) - ) + ray.get(cancel_ref, timeout=30) except Exception: logger.exception("Error cancelling worker task") pass diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 3c66ddd20869..ce48bcc51333 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -66,7 +66,16 @@ def __init__( self._type_hint: Optional[ChannelOutputType] = ChannelOutputType() def with_type_hint(self, typ: ChannelOutputType): - self._type_hint = copy.deepcopy(typ) + if typ.is_direct_return: + old_contains_typ = self._type_hint.contains_type + self._type_hint = copy.deepcopy(typ) + if old_contains_typ is not None and typ.contains_type is None: + # The contained type was set before the return + # type, and the new return type doesn't have a + # contained type set. + self._type_hint.set_contains_type(old_contains_typ) + else: + self._type_hint.set_contains_type(typ) return self @property diff --git a/python/ray/dag/tests/experimental/test_torch_tensor_dag.py b/python/ray/dag/tests/experimental/test_torch_tensor_dag.py index 16a97f1a7a12..b0c9efe02615 100644 --- a/python/ray/dag/tests/experimental/test_torch_tensor_dag.py +++ b/python/ray/dag/tests/experimental/test_torch_tensor_dag.py @@ -3,6 +3,7 @@ import os import sys import torch +import time import pytest @@ -78,7 +79,9 @@ def test_torch_tensor_p2p(ray_start_regular): # Test torch.Tensor sent between actors. with InputNode() as inp: dag = sender.send.bind(shape, dtype, inp) - dag = dag.with_type_hint(TorchTensorType(shape, dtype)) + # TODO(swang): Test that we are using the minimum number of + # channels/messages when direct_return=True. 
+ dag = dag.with_type_hint(TorchTensorType(shape, dtype, direct_return=True)) dag = receiver.recv.bind(dag) compiled_dag = dag.experimental_compile() @@ -94,7 +97,9 @@ def test_torch_tensor_p2p(ray_start_regular): # Passing tensors of a similar or smaller shape is okay. with InputNode() as inp: dag = sender.send.bind(shape, dtype, inp) - dag = dag.with_type_hint(TorchTensorType((20,), dtype)) + # TODO(swang): Test that we are using the minimum number of + # channels/messages when direct_return=True. + dag = dag.with_type_hint(TorchTensorType((20,), dtype, direct_return=True)) dag = receiver.recv.bind(dag) compiled_dag = dag.experimental_compile() for i in range(3): @@ -108,7 +113,9 @@ def test_torch_tensor_p2p(ray_start_regular): # Passing a much larger tensor will error. with InputNode() as inp: dag = sender.send.bind(1_000_000, dtype, inp) - dag = dag.with_type_hint(TorchTensorType(shape, dtype)) + # TODO(swang): Test that we are using the minimum number of + # channels/messages when direct_return=True. + dag = dag.with_type_hint(TorchTensorType(shape, dtype, direct_return=True)) dag = receiver.recv.bind(dag) compiled_dag = dag.experimental_compile() output_channel = compiled_dag.execute(1) @@ -116,6 +123,26 @@ def test_torch_tensor_p2p(ray_start_regular): result = output_channel.begin_read() compiled_dag.teardown() + # Passing a torch.tensor inside of other data is okay even if + # direct_return=True, if `transport` is not set. 
+ with InputNode() as inp: + dag = sender.send_dict_with_tuple_args.bind(inp) + dag = dag.with_type_hint( + TorchTensorType( + shape=shape, + dtype=dtype, + direct_return=True, + ) + ) + dag = receiver.recv_dict.bind(dag) + + compiled_dag = dag.experimental_compile() + + output_channel = compiled_dag.execute((shape, dtype, 1)) + output_channel.begin_read() + output_channel.end_read() + compiled_dag.teardown() + @pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True) def test_torch_tensor_as_dag_input(ray_start_regular): @@ -133,7 +160,11 @@ def test_torch_tensor_as_dag_input(ray_start_regular): # Test torch.Tensor as input. with InputNode() as inp: - torch_inp = inp.with_type_hint(TorchTensorType(shape, dtype)) + # TODO(swang): Test that we are using the minimum number of + # channels/messages when direct_return=True. + torch_inp = inp.with_type_hint( + TorchTensorType(shape, dtype, direct_return=True) + ) dag = receiver.recv.bind(torch_inp) compiled_dag = dag.experimental_compile() @@ -178,7 +209,11 @@ def test_torch_tensor_nccl(ray_start_regular): with InputNode() as inp: dag = sender.send.bind(shape, dtype, inp) - dag = dag.with_type_hint(TorchTensorType(shape, dtype, transport="nccl")) + # TODO(swang): Test that we are using the minimum number of + # channels/messages when direct_return=True. + dag = dag.with_type_hint( + TorchTensorType(shape, dtype, transport="nccl", direct_return=True) + ) dag = receiver.recv.bind(dag) # Test normal execution. @@ -243,7 +278,9 @@ def test_torch_tensor_nccl_dynamic(ray_start_regular): with InputNode() as inp: dag = sender.send_with_tuple_args.bind(inp) - dag = dag.with_type_hint(TorchTensorType(transport="nccl")) + # TODO(swang): Test that we are using the minimum number of + # channels/messages when direct_return=True. 
+ dag = dag.with_type_hint(TorchTensorType(transport="nccl", direct_return=True)) dag = receiver.recv.bind(dag) compiled_dag = dag.experimental_compile() @@ -281,7 +318,13 @@ def test_torch_tensor_nccl_wrong_shape(ray_start_regular): # Passing tensors of the wrong shape will error. with InputNode() as inp: dag = sender.send.bind(shape, dtype, inp) - dag = dag.with_type_hint(TorchTensorType((20,), dtype, transport="nccl")) + dag = dag.with_type_hint( + TorchTensorType( + (20,), + dtype, + transport="nccl", + ) + ) dag = receiver.recv.bind(dag) compiled_dag = dag.experimental_compile() @@ -292,8 +335,136 @@ def test_torch_tensor_nccl_wrong_shape(ray_start_regular): compiled_dag.teardown() - ray.kill(sender) - ray.kill(receiver) + # TODO(swang): This currently requires time.sleep to avoid some issue with + # following tests. + time.sleep(3) + + +@pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True) +def test_torch_tensor_nccl_nested(ray_start_regular): + """ + Test nested torch.Tensor passed via NCCL. Its shape and dtype is statically + declared. + """ + if not USE_GPU: + pytest.skip("NCCL tests require GPUs") + + assert ( + sum(node["Resources"].get("GPU", 0) for node in ray.nodes()) > 1 + ), "This test requires at least 2 GPUs" + + actor_cls = TorchTensorWorker.options(num_gpus=1) + + sender = actor_cls.remote() + receiver = actor_cls.remote() + + shape = (10,) + dtype = torch.float16 + + with InputNode() as inp: + dag = sender.send_dict_with_tuple_args.bind(inp) + dag = dag.with_type_hint( + TorchTensorType(shape=shape, dtype=dtype, transport="nccl") + ) + dag = receiver.recv_dict.bind(dag) + + compiled_dag = dag.experimental_compile() + + for i in range(3): + args = (shape, dtype, 1) + + output_channel = compiled_dag.execute(args) + # TODO(swang): Replace with fake ObjectRef. 
+ result = output_channel.begin_read() + expected_result = {0: (0, shape, dtype)} + assert result == expected_result + output_channel.end_read() + + compiled_dag.teardown() + + +@pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True) +def test_torch_tensor_nccl_nested_dynamic(ray_start_regular): + """ + Test nested torch.Tensor passed via NCCL. Its shape and dtype is + dynamically declared, and there may be multiple tensors. + """ + if not USE_GPU: + pytest.skip("NCCL tests require GPUs") + + assert ( + sum(node["Resources"].get("GPU", 0) for node in ray.nodes()) > 1 + ), "This test requires at least 2 GPUs" + + actor_cls = TorchTensorWorker.options(num_gpus=1) + + sender = actor_cls.remote() + receiver = actor_cls.remote() + + with InputNode() as inp: + dag = sender.send_dict_with_tuple_args.bind(inp) + dag = dag.with_type_hint(TorchTensorType(transport="nccl")) + dag = receiver.recv_dict.bind(dag) + + compiled_dag = dag.experimental_compile() + + for i in range(3): + i += 1 + + shape = (10 * i,) + dtype = torch.float16 + args = (shape, dtype, i) + + output_channel = compiled_dag.execute(args) + # TODO(swang): Replace with fake ObjectRef. + result = output_channel.begin_read() + expected_result = {j: (j, shape, dtype) for j in range(i)} + assert result == expected_result + output_channel.end_read() + + compiled_dag.teardown() + + +@pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True) +def test_torch_tensor_nccl_direct_return_error(ray_start_regular): + if not USE_GPU: + pytest.skip("NCCL tests require GPUs") + + assert ( + sum(node["Resources"].get("GPU", 0) for node in ray.nodes()) > 1 + ), "This test requires at least 2 GPUs" + + actor_cls = TorchTensorWorker.options(num_cpus=0, num_gpus=1) + + sender = actor_cls.remote() + receiver = actor_cls.remote() + + shape = (10,) + dtype = torch.float16 + + # Passing a non-tensor value when direct_return=True and tranport="nccl" + # fails. 
+ with InputNode() as inp: + dag = sender.send_dict_with_tuple_args.bind(inp) + dag = dag.with_type_hint( + TorchTensorType( + transport=TorchTensorType.NCCL, + direct_return=True, + ) + ) + dag = receiver.recv_dict.bind(dag) + + compiled_dag = dag.experimental_compile() + + output_channel = compiled_dag.execute((shape, dtype, 1)) + with pytest.raises(OSError): + output_channel.begin_read() + + compiled_dag.teardown() + + # TODO(swang): This currently requires time.sleep to avoid some issue with + # following tests. + time.sleep(3) if __name__ == "__main__": diff --git a/python/ray/experimental/channel/common.py b/python/ray/experimental/channel/common.py index b0f173ca7369..4ec632db4132 100644 --- a/python/ray/experimental/channel/common.py +++ b/python/ray/experimental/channel/common.py @@ -1,5 +1,6 @@ import asyncio import concurrent +import copy import threading from dataclasses import dataclass from typing import Any, Dict, List, Optional @@ -16,6 +17,9 @@ @PublicAPI(stability="alpha") class ChannelOutputType: + def __init__(self): + self._contains_type: Optional["ChannelOutputType"] = None + def register_custom_serializer(self) -> None: """ Register any custom serializers needed to pass data of this type. This @@ -29,7 +33,41 @@ def register_custom_serializer(self) -> None: default device. Instead, these should be extracted from the worker-local _SerializationContext. """ - pass + if self._contains_type is not None: + self._contains_type.register_custom_serializer() + + @property + def is_direct_return(self) -> bool: + """ + Some channels may contain other values that should be sent via a + different channel. This returns whether the value is a direct return or + if it is "nested" inside a different channel. + """ + return True + + @property + def contains_type(self) -> "ChannelOutputType": + """ + Some channel values may contain an object that should be sent through a + different channel. 
For example, a Python object containing a GPU tensor + may be sent over two channels, one to serialize the Python data on CPU + memory and another to transfer the GPU data over NCCL. This function + returns the type of this nested value, if any. + """ + return self._contains_type + + def set_contains_type(self, typ: "ChannelOutputType") -> None: + """ + Mark that values sent on this channel may contain objects that should + be sent through a different channel. + """ + from ray.experimental.channel.torch_tensor_type import TorchTensorType + + if typ is not None: + assert isinstance( + typ, TorchTensorType + ), "Contained type must be of type TorchTensorType" + self._contains_type = copy.deepcopy(typ) def create_channel( self, @@ -51,6 +89,10 @@ def create_channel( raise NotImplementedError def requires_nccl(self) -> bool: + if self._contains_type is not None: + if self._contains_type.requires_nccl(): + return True + # By default, channels do not require NCCL. return False diff --git a/python/ray/experimental/channel/serialization_context.py b/python/ray/experimental/channel/serialization_context.py index 2a70e85b328b..4ad13db74722 100644 --- a/python/ray/experimental/channel/serialization_context.py +++ b/python/ray/experimental/channel/serialization_context.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, List, Optional, Union if TYPE_CHECKING: import numpy as np @@ -9,11 +9,32 @@ class _SerializationContext: def __init__(self): self.torch_device: Optional["torch.device"] = None self.use_external_transport: bool = False + self.tensors: List["torch.Tensor"] = [] + + def set_use_external_transport(self, use_external_transport: bool) -> None: + self.use_external_transport = use_external_transport def set_torch_device(self, torch_device: "torch.device") -> None: self.torch_device = torch_device - def serialize_tensor(self, tensor: "torch.Tensor") -> "np.ndarray": + def reset_tensors(self, tensors: List["torch.Tensor"]) 
-> List["torch.Tensor"]: + prev_tensors = self.tensors + self.tensors = tensors + return prev_tensors + + def serialize_tensor(self, tensor: "torch.Tensor") -> Union[int, "np.ndarray"]: + if self.use_external_transport and tensor.device == self.torch_device: + # External transport is enabled and we found a tensor that matches + # our device. Add the actual tensor to a buffer. The buffer of + # tensors should later be popped by the caller and sent via + # external transport. + self.tensors.append(tensor) + # Return a placeholder. + return len(self.tensors) - 1 + + return self.serialize_to_numpy(tensor) + + def serialize_to_numpy(self, tensor: "torch.Tensor") -> "np.ndarray": # Transfer through Ray's shared memory store for now. # TODO(swang): This requires two copies, one to transfer from GPU to # CPU and another from CPU to shared memory. Ideally we should elide @@ -24,12 +45,20 @@ def serialize_tensor(self, tensor: "torch.Tensor") -> "np.ndarray": return tensor.numpy() - def deserialize_tensor(self, np_array: "np.ndarray"): + def deserialize_tensor(self, val: Union["np.ndarray", int]): + # Found a placeholder for a tensor that was serialized via NCCL. + # Replace it with the corresponding deserialized tensor. + if isinstance(val, int): + return self.tensors[val] + + return self.deserialize_from_numpy(val) + + def deserialize_from_numpy(self, np_array: "np.ndarray"): import torch # TODO(swang): Support local P2P transfers if available. # If there is a GPU assigned to this worker, move it there. - if self.torch_device.type == "cuda": + if self.torch_device is not None and self.torch_device.type == "cuda": # Use zero-copy from_numpy() because we are going to copy to GPU # anyway. # TODO: Pin the np_array memory to reduce data movement time. 
diff --git a/python/ray/experimental/channel/shared_memory_channel.py b/python/ray/experimental/channel/shared_memory_channel.py index 94c93c35bd56..b99c7bd5fd4c 100644 --- a/python/ray/experimental/channel/shared_memory_channel.py +++ b/python/ray/experimental/channel/shared_memory_channel.py @@ -3,7 +3,9 @@ from typing import Any, List, Optional, Union import ray +from ray._raylet import SerializedObject from ray.experimental.channel.common import ChannelInterface, ChannelOutputType +from ray.experimental.channel.torch_tensor_type import TorchTensorType from ray.util.annotations import PublicAPI # Logger for this module. It should be configured at the entry point @@ -106,8 +108,35 @@ def create_channel( A ChannelInterface that can be used to pass data of this type. """ + if self._contains_type is not None: + assert isinstance( + self._contains_type, TorchTensorType + ), "_contains_type must be of type TorchTensorType" + + from ray.experimental.channel.torch_tensor_nccl_channel import ( + NestedTorchTensorNcclChannel, + ) + + if self._contains_type.requires_nccl(): + cpu_data_typ = SharedMemoryType( + buffer_size_bytes=self.buffer_size_bytes + ) + return NestedTorchTensorNcclChannel( + writer, + readers, + gpu_data_typ=self._contains_type, + cpu_data_typ=cpu_data_typ, + ) + return Channel(writer, readers) + def set_nccl_group_id(self, group_id: str) -> None: + assert self.requires_nccl() + + # Shared memory channels don't need NCCL but they can + # contain objects that use NCCL. 
+ self._contains_type.set_nccl_group_id(group_id) + @PublicAPI(stability="alpha") class Channel(ChannelInterface): @@ -311,17 +340,22 @@ def __reduce__(self): def write(self, value: Any): self.ensure_registered_as_writer() - try: - serialized_value = self._worker.get_serialization_context().serialize(value) - except TypeError as e: - sio = io.StringIO() - ray.util.inspect_serializability(value, print_file=sio) - msg = ( - "Could not serialize the put value " - f"{repr(value)}:\n" - f"{sio.getvalue()}" - ) - raise TypeError(msg) from e + if not isinstance(value, SerializedObject): + try: + serialized_value = self._worker.get_serialization_context().serialize( + value + ) + except TypeError as e: + sio = io.StringIO() + ray.util.inspect_serializability(value, print_file=sio) + msg = ( + "Could not serialize the put value " + f"{repr(value)}:\n" + f"{sio.getvalue()}" + ) + raise TypeError(msg) from e + else: + serialized_value = value self._worker.core_worker.experimental_channel_put_serialized( serialized_value, diff --git a/python/ray/experimental/channel/torch_tensor_nccl_channel.py b/python/ray/experimental/channel/torch_tensor_nccl_channel.py index b4764cc6514b..19020cad0612 100644 --- a/python/ray/experimental/channel/torch_tensor_nccl_channel.py +++ b/python/ray/experimental/channel/torch_tensor_nccl_channel.py @@ -1,7 +1,8 @@ +import io import logging import uuid from types import ModuleType -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, Any, List, Optional, Union import ray import ray.util.serialization @@ -25,6 +26,151 @@ logger = logging.getLogger(__name__) +class NestedTorchTensorNcclChannel(ChannelInterface): + def __init__( + self, + writer: ray.actor.ActorHandle, + readers: List[ray.actor.ActorHandle], + gpu_data_typ: "TorchTensorType", + cpu_data_typ: Optional["SharedMemoryType"] = None, + _gpu_data_channel: Optional["TorchTensorNcclChannel"] = None, + _cpu_data_channel: Optional["Channel"] = None, + ): + """ + Can 
be used to send GPU tensors nested inside other data. The data is + sent via shared memory while the GPU tensors are sent through a P2P + transport (NCCL). + + NOTE: This class is currently not thread-safe because it reads and + writes the worker-local + ray.experimental.channel.serialization_context._SerializationContext + when serializing data. + """ + self._writer = writer + self._readers = readers + + if _gpu_data_channel is not None or _cpu_data_channel is not None: + # This path is used when the NestedTorchTensorNcclChannel is being + # deserialized. + assert ( + writer is None + and readers is None + and gpu_data_typ is None + and cpu_data_typ is None + ) + assert _gpu_data_channel is not None and _cpu_data_channel is not None + self._gpu_data_channel = _gpu_data_channel + self._cpu_data_channel = _cpu_data_channel + else: + # This path is used when the NestedTorchTensorNcclChannel is first + # being created, by the writer of the channel. + self._gpu_data_channel: TorchTensorNcclChannel = ( + gpu_data_typ.create_channel(writer, readers) + ) + self._cpu_data_channel: Optional["Channel"] = None + if cpu_data_typ is not None: + self._cpu_data_channel = cpu_data_typ.create_channel(writer, readers) + + # Used for serialization. 
+ self._worker = ray._private.worker.global_worker + self._worker.check_connected() + + ctx = ChannelContext.get_current() + self.serialization_ctx = ctx.serialization_context + assert self.serialization_ctx is not None + + @classmethod + def from_channels( + cls, + gpu_data_channel: "TorchTensorNcclChannel", + cpu_data_channel: Optional["Channel"], + ): + return cls( + writer=None, + readers=None, + gpu_data_typ=None, + cpu_data_typ=None, + _gpu_data_channel=gpu_data_channel, + _cpu_data_channel=cpu_data_channel, + ) + + def __reduce__(self): + return ( + NestedTorchTensorNcclChannel.from_channels, + (self._gpu_data_channel, self._cpu_data_channel), + ) + + def ensure_registered_as_writer(self): + self._gpu_data_channel.ensure_registered_as_writer() + if self._cpu_data_channel is not None: + self._cpu_data_channel.ensure_registered_as_writer() + + def ensure_registered_as_reader(self): + self._gpu_data_channel.ensure_registered_as_reader() + if self._cpu_data_channel is not None: + self._cpu_data_channel.ensure_registered_as_reader() + + def write(self, value: Any): + self.serialization_ctx.reset_tensors([]) + # All tensors found in `value` will be transferred via NCCL. + self.serialization_ctx.set_use_external_transport(True) + + try: + # Serialize the data. All tensors that match our current device + # will be extracted into the serialization context and replaced + # with a placeholder. + serialized_cpu_data = self._worker.get_serialization_context().serialize( + value + ) + except TypeError as e: + sio = io.StringIO() + ray.util.inspect_serializability(value, print_file=sio) + msg = ( + "Could not serialize the put value " + f"{repr(value)}:\n" + f"{sio.getvalue()}" + ) + raise TypeError(msg) from e + finally: + # Pop the tensors that were found during serialization of `value`. + tensors_to_send = self.serialization_ctx.reset_tensors([]) + # Reset the serialization method to now serialize torch.Tensors + # normally. 
+ self.serialization_ctx.set_use_external_transport(False) + + # Send the extracted tensors through a GPU-specific channel. + self._gpu_data_channel.write(tensors_to_send) + # Send the rest of the data, with placeholders for the extracted + # tensors, through a CPU-specific channel. + self._cpu_data_channel.write(serialized_cpu_data) + + def begin_read(self) -> Any: + tensors = self._gpu_data_channel.begin_read() + + if self._gpu_data_channel.has_static_type(): + # If the channel was declared with a static TorchTensorType, then + # the task is allowed to return at most one tensor, and its shape + # and dtype must match the declared type. Wrap the tensor in a + # list since the following calls expect a list. + tensors = [tensors] + + self.serialization_ctx.reset_tensors(tensors) + data = self._cpu_data_channel.begin_read() + self.serialization_ctx.reset_tensors([]) + + return data + + def end_read(self) -> None: + self._gpu_data_channel.end_read() + if self._cpu_data_channel: + self._cpu_data_channel.end_read() + + def close(self) -> None: + self._gpu_data_channel.close() + if self._cpu_data_channel is not None: + self._cpu_data_channel.close() + + @DeveloperAPI class TorchTensorNcclChannel(ChannelInterface): def __init__( @@ -150,16 +296,35 @@ def _get_tensor_meta(self, tensor: "torch.Tensor") -> Optional["TorchTensorType" def write( self, - tensor: "torch.Tensor", + tensors: Union["torch.Tensor", List["torch.Tensor"], Exception], ): - if isinstance(tensor, ray.exceptions.RayTaskError): + if isinstance(tensors, ray.exceptions.RayTaskError): # TODO(swang): Write exceptions to the meta channel if it is # available. 
- raise tensor - - meta = self._get_tensor_meta(tensor) - if meta is not None: - self._meta_channel.write(meta) + raise tensors + + if isinstance(tensors, list): + meta_list = [] + for tensor in tensors: + meta_list.append(self._get_tensor_meta(tensor)) + if self.has_static_type(): + # Make sure that there is exactly one tensor to send, and its + # metadata should have matched the static type. + if meta_list != [None]: + raise ValueError( + "DAGNode annotated with " + "TorchTensorType(shape=shape, dtype=dtype))` can return at " + "most one tensor with the declared `shape` and `dtype`. " + "Use TorchTensorType() if value contains more than one " + "tensor or tensor of dynamic size." + ) + else: + self._meta_channel.write(meta_list) + else: + meta = self._get_tensor_meta(tensors) + if meta is not None: + self._meta_channel.write(meta) + tensors = [tensors] # NOTE(swang): We must send the metadata *before* launching the NCCL # send. We are using blocking NCCL ops, so the following calls will @@ -167,23 +332,35 @@ def write( # kernel together before either can proceed. Therefore, we send the # metadata first so that the receiver can read the metadata and then # launch the same NCCL op. - # TODO: If there are multiple readers, can replace with a - # broadcast. - for rank in self._reader_ranks: - self._nccl_group.send(tensor, rank) + for tensor in tensors: + # TODO: If there are multiple readers, can replace with a + # broadcast. 
+ for rank in self._reader_ranks: + self._nccl_group.send(tensor, rank) + + def _begin_read_single_tensor(self, typ: "TorchTensorType") -> "torch.Tensor": + buf = self.torch.zeros(typ.shape, dtype=typ.dtype, device=self._device) + self._nccl_group.recv(buf, self._writer_rank) + return buf - def begin_read(self) -> "torch.Tensor": + def begin_read(self) -> Union["torch.Tensor", List["torch.Tensor"]]: if self._meta_channel is not None: - typ = self._meta_channel.begin_read() + meta = self._meta_channel.begin_read() # It's safe to release the channel because shape and dtype should get # copied during deserialization. self._meta_channel.end_read() else: - typ = self._typ + meta = self._typ - buf = self.torch.zeros(typ.shape, dtype=typ.dtype, device=self._device) - self._nccl_group.recv(buf, self._writer_rank) - return buf + if not isinstance(meta, list): + return self._begin_read_single_tensor(meta) + + bufs: List["torch.Tensor"] = [] + for typ in meta: + bufs.append(self._begin_read_single_tensor(typ)) + # TODO: Sync CUDA stream after receiving all tensors, instead of after + # each tensor. + return bufs def end_read(self) -> None: return diff --git a/python/ray/experimental/channel/torch_tensor_type.py b/python/ray/experimental/channel/torch_tensor_type.py index 17ead075c85f..7c987f108311 100644 --- a/python/ray/experimental/channel/torch_tensor_type.py +++ b/python/ray/experimental/channel/torch_tensor_type.py @@ -1,3 +1,4 @@ +import logging from typing import TYPE_CHECKING, List, Optional, Tuple, Union import ray @@ -7,6 +8,8 @@ if TYPE_CHECKING: import torch +logger = logging.getLogger(__name__) + # 100KB to store metadata and/or exceptions. 
# NOTE(swang): This will consume memory but it should not affect performance # because we only copy the actual data stored, not the maximum size of the @@ -36,7 +39,8 @@ def __init__( self, shape: Union[int, Tuple[int], str] = AUTO, dtype: "torch.dtype" = AUTO, - transport: Optional[str] = None, + transport: Optional[str] = AUTO, + direct_return: Optional[bool] = False, ): """ A type hint that can be used to annotate DAG nodes that return a @@ -55,13 +59,19 @@ def __init__( maximum size of the tensor. If a DAG node's returned serialized tensor exceeds this size, the task will error. For tensors passed via NCCL, the returned tensor must *match* the given - shape; if it does not match, the task will error. + shape; if it does not match, the task will error. Specifying + the shape and dtype ahead of time will eliminate the + performance overhead from an additional metadata transfer. dtype: The expected dtype of the torch.Tensor. Similar to the shape, this may be statically or dynamically declared. transport: "auto" (default) means that tensors will be passed via host memory, using numpy as the serialization format. Pass TorchTensorType.NCCL or "nccl" to use NCCL instead, avoiding the host memory copy. + direct_return: Whether the tensor is sent directly or inside of + other data. If a non-default `transport` is used, this allows + the sender and receiver to eliminate performance overhead from + an additional data transfer. """ super().__init__() @@ -72,10 +82,29 @@ def __init__( self.shape = shape self.dtype = dtype + self.direct_return = direct_return + + if transport not in [self.AUTO, self.NCCL]: + raise ValueError( + "`transport` must be TorchTensorType.AUTO or TorchTensorType.NCCL" + ) self.transport = transport + self._nccl_group_id: Optional[str] = None + if self.direct_return and self.transport == self.AUTO: + logger.info( + "TorchTensorType(direct_return=True) has no effect when " + "`transport` is TorchTensorType.AUTO (default)." 
+ ) + + @property + def is_direct_return(self) -> bool: + return self.direct_return + def register_custom_serializer(self) -> None: + super().register_custom_serializer() + import torch default_device = _get_default_torch_device() @@ -96,6 +125,9 @@ def deserialize(b): deserializer=deserialize, ) + def set_contains_type(self, typ: "ChannelOutputType") -> None: + raise ValueError("TorchTensorType cannot contain other types") + def create_channel( self, writer: Optional["ray.actor.ActorHandle"], From a77726912b8094e65d1dfe48b9da1c7ce1a92ca2 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Sat, 25 May 2024 23:10:26 -0700 Subject: [PATCH 19/65] [Data] Upgrade Arrow version to 16 in CI (#45565) The most recent stable version of Arrow is 16. So, this PR bumps the version in CI from 15 to 16. Signed-off-by: Balaji Veeramani --- .buildkite/core.rayci.yml | 4 ++-- .buildkite/data.rayci.yml | 24 +++++++++---------- ...ild.wanda.yaml => data16.build.wanda.yaml} | 6 ++--- 3 files changed, 17 insertions(+), 17 deletions(-) rename ci/docker/{data15.build.wanda.yaml => data16.build.wanda.yaml} (83%) diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index 25a8ba72a287..0c0bb5921e75 100644 --- a/.buildkite/core.rayci.yml +++ b/.buildkite/core.rayci.yml @@ -142,10 +142,10 @@ steps: commands: - bazel run //ci/ray_ci:test_in_docker -- python/ray/util/dask/... python/ray/tests/modin/... 
core - --build-name data15build + --build-name data16build --parallelism-per-worker 2 depends_on: - - data15build + - data16build - forge - label: ":ray: core: dashboard tests" diff --git a/.buildkite/data.rayci.yml b/.buildkite/data.rayci.yml index 7c5b528dc9f6..0d1b2ac5e28c 100644 --- a/.buildkite/data.rayci.yml +++ b/.buildkite/data.rayci.yml @@ -7,8 +7,8 @@ steps: - name: data6build wanda: ci/docker/data6.build.wanda.yaml - - name: data15build - wanda: ci/docker/data15.build.wanda.yaml + - name: data16build + wanda: ci/docker/data16.build.wanda.yaml - name: databuild-multipy label: "wanda: databuild-py{{matrix}}" @@ -42,7 +42,7 @@ steps: --except-tags data_integration,doctest depends_on: data6build - - label: ":database: data: arrow 15 tests" + - label: ":database: data: arrow 16 tests" tags: - python - data @@ -52,9 +52,9 @@ steps: - bazel run //ci/ray_ci:test_in_docker -- //python/ray/data/... //python/ray/air/... data --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3 - --build-name data15build + --build-name data16build --except-tags data_integration,doctest - depends_on: data15build + depends_on: data16build - label: ":database: data: arrow nightly tests" tags: @@ -91,17 +91,17 @@ steps: commands: # doc tests - bazel run //ci/ray_ci:test_in_docker -- python/ray/... //doc/... data - --build-name data15build + --build-name data16build --except-tags gpu --only-tags doctest --parallelism-per-worker 2 # doc examples - bazel run //ci/ray_ci:test_in_docker -- //doc/... data - --build-name data15build + --build-name data16build --except-tags gpu,post_wheel_build,doctest --parallelism-per-worker 2 --skip-ray-installation - depends_on: data15build + depends_on: data16build - label: ":database: data: doc gpu tests" tags: @@ -144,9 +144,9 @@ steps: instance_type: small commands: - bazel run //ci/ray_ci:test_in_docker -- python/ray/dashboard/... 
data - --build-name data15build + --build-name data16build --parallelism-per-worker 3 - depends_on: data15build + depends_on: data16build - label: ":database: data: flaky tests" tags: @@ -158,9 +158,9 @@ steps: commands: - bazel run //ci/ray_ci:test_in_docker -- //... data --run-flaky-tests --parallelism-per-worker 3 - --build-name data15build + --build-name data16build --except-tags gpu_only,gpu - depends_on: data15build + depends_on: data16build - label: ":database: data: flaky gpu tests" tags: diff --git a/ci/docker/data15.build.wanda.yaml b/ci/docker/data16.build.wanda.yaml similarity index 83% rename from ci/docker/data15.build.wanda.yaml rename to ci/docker/data16.build.wanda.yaml index 20a9b459d0ee..6a9af8b99d99 100644 --- a/ci/docker/data15.build.wanda.yaml +++ b/ci/docker/data16.build.wanda.yaml @@ -1,4 +1,4 @@ -name: "data15build" +name: "data16build" froms: ["cr.ray.io/rayproject/oss-ci-base_ml"] dockerfile: ci/docker/data.build.Dockerfile srcs: @@ -10,6 +10,6 @@ srcs: - python/requirements/ml/data-requirements.txt - python/requirements/ml/data-test-requirements.txt build_args: - - ARROW_VERSION=15.* + - ARROW_VERSION=16.* tags: - - cr.ray.io/rayproject/data15build + - cr.ray.io/rayproject/data16build From 15c294ed6ca3bfd8dcda8f958c354abc9c28295f Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Sun, 26 May 2024 07:39:59 -0700 Subject: [PATCH 20/65] [ci][microcheck/13] reuse the logic to determine microcheck tests (#45504) Currently the logic to determine the list of microcheck tests and their rayci step ids are diverging. This PR move more function into Test class, so we converge more these two logic. This make sure that we will trigger the right buildkite steps to cover most if not all microcheck tests. 
Test: - CI --------- Signed-off-by: can --- .buildkite/data.rayci.yml | 2 +- .../determine_microcheck_step_ids.py | 9 +- ci/ray_ci/test_tester.py | 70 +------------- ci/ray_ci/tester.py | 38 ++------ release/ray_release/test.py | 61 +++++++++--- release/ray_release/tests/test_test.py | 93 +++++++++++++++---- 6 files changed, 146 insertions(+), 127 deletions(-) diff --git a/.buildkite/data.rayci.yml b/.buildkite/data.rayci.yml index 0d1b2ac5e28c..4b1a0defe788 100644 --- a/.buildkite/data.rayci.yml +++ b/.buildkite/data.rayci.yml @@ -57,10 +57,10 @@ steps: depends_on: data16build - label: ":database: data: arrow nightly tests" + if: pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa" tags: - python - data - - skip-on-premerge instance_type: medium parallelism: 2 commands: diff --git a/ci/ray_ci/automation/determine_microcheck_step_ids.py b/ci/ray_ci/automation/determine_microcheck_step_ids.py index d4ca520e5161..4d15da5f8e6e 100644 --- a/ci/ray_ci/automation/determine_microcheck_step_ids.py +++ b/ci/ray_ci/automation/determine_microcheck_step_ids.py @@ -1,4 +1,5 @@ import click +import os from ci.ray_ci.utils import ci_init from ray_release.test import ( @@ -8,6 +9,8 @@ MACOS_TEST_PREFIX, ) +BAZEL_WORKSPACE_DIR = os.environ.get("BUILD_WORKSPACE_DIRECTORY", "") + @click.command() def main() -> None: @@ -16,9 +19,9 @@ def main() -> None: """ ci_init() steps = ( - list(Test.gen_high_impact_tests(LINUX_TEST_PREFIX).keys()) - + list(Test.gen_high_impact_tests(WINDOWS_TEST_PREFIX).keys()) - + list(Test.gen_high_impact_tests(MACOS_TEST_PREFIX).keys()) + Test.gen_microcheck_step_ids(LINUX_TEST_PREFIX, BAZEL_WORKSPACE_DIR) + .union(Test.gen_microcheck_step_ids(WINDOWS_TEST_PREFIX, BAZEL_WORKSPACE_DIR)) + .union(Test.gen_microcheck_step_ids(MACOS_TEST_PREFIX, BAZEL_WORKSPACE_DIR)) ) print(",".join(steps)) diff --git a/ci/ray_ci/test_tester.py b/ci/ray_ci/test_tester.py index 86bbee6b859f..1a2157dee889 100644 --- 
a/ci/ray_ci/test_tester.py +++ b/ci/ray_ci/test_tester.py @@ -13,10 +13,9 @@ _get_container, _get_all_test_query, _get_test_targets, - _get_high_impact_test_targets, + _get_new_tests, _get_flaky_test_targets, _get_tag_matcher, - _get_new_tests, ) from ray_release.test import Test, TestState @@ -121,15 +120,12 @@ def test_get_test_targets() -> None: ), mock.patch( "ray_release.test.Test.gen_from_s3", return_value=test_objects, - ), mock.patch( - "ray_release.test.Test.gen_high_impact_tests", - return_value={"step": test_objects}, - ), mock.patch( - "ray_release.test.Test.get_changed_tests", - return_value=set(), ), mock.patch( "ci.ray_ci.tester._get_new_tests", return_value=set(), + ), mock.patch( + "ray_release.test.Test.gen_microcheck_tests", + return_value={test.get_target() for test in test_objects}, ): assert set( _get_test_targets( @@ -211,64 +207,6 @@ def test_get_all_test_query() -> None: ) -def test_get_high_impact_test_targets() -> None: - test_harness = [ - { - "input": [], - "new_tests": set(), - "changed_tests": set(), - "human_tests": set(), - "output": set(), - }, - { - "input": [ - _stub_test( - { - "name": "linux://core_good", - "team": "core", - } - ), - _stub_test( - { - "name": "linux://serve_good", - "team": "serve", - } - ), - ], - "new_tests": {"//core_new"}, - "changed_tests": {"//core_new"}, - "human_tests": {"//human_test"}, - "output": { - "//core_good", - "//core_new", - "//human_test", - }, - }, - ] - for test in test_harness: - with mock.patch( - "ray_release.test.Test.gen_high_impact_tests", - return_value={"step": test["input"]}, - ), mock.patch( - "ci.ray_ci.tester._get_new_tests", - return_value=test["new_tests"], - ), mock.patch( - "ray_release.test.Test.get_changed_tests", - return_value=test["changed_tests"], - ), mock.patch( - "ray_release.test.Test.get_human_specified_tests", - return_value=test["human_tests"], - ): - assert ( - _get_high_impact_test_targets( - "core", - "linux", - LinuxTesterContainer("test", 
skip_ray_installation=True), - ) - == test["output"] - ) - - @mock.patch("ci.ray_ci.tester_container.TesterContainer.run_script_with_output") @mock.patch("ray_release.test.Test.gen_from_s3") def test_get_new_tests(mock_gen_from_s3, mock_run_script_with_output) -> None: diff --git a/ci/ray_ci/tester.py b/ci/ray_ci/tester.py index ec2967689398..06a1f312808b 100644 --- a/ci/ray_ci/tester.py +++ b/ci/ray_ci/tester.py @@ -1,4 +1,3 @@ -import itertools import os import sys from typing import List, Set, Tuple, Optional @@ -37,7 +36,6 @@ """ # noqa: E501 DEFAULT_EXCEPT_TAGS = {"manual"} -MICROCHECK_COMMAND = "@microcheck" # Gets the path of product/tools/docker (i.e. the parent of 'common') bazel_workspace_dir = os.environ.get("BUILD_WORKSPACE_DIRECTORY", "") @@ -394,38 +392,20 @@ def _get_test_targets( if get_high_impact_tests: # run high impact test cases, so we include only high impact tests in the list # of targets provided by users - high_impact_tests = _get_high_impact_test_targets( - team, operating_system, container - ) + prefix = f"{operating_system}:" + # TODO(can): we should also move the logic of _get_new_tests into the + # gen_microcheck_tests function; this is currently blocked by the fact that + # we need a container to run _get_new_tests + high_impact_tests = Test.gen_microcheck_tests( + prefix=prefix, + bazel_workspace_dir=bazel_workspace_dir, + team=team, + ).union(_get_new_tests(prefix, container)) final_targets = high_impact_tests.intersection(final_targets) return list(final_targets) -def _get_high_impact_test_targets( - team: str, operating_system: str, container: TesterContainer -) -> Set[str]: - """ - Get all test targets that are high impact - """ - os_prefix = f"{operating_system}:" - step_id_to_tests = Test.gen_high_impact_tests(prefix=os_prefix) - high_impact_tests = { - test.get_name().lstrip(os_prefix) - for test in itertools.chain.from_iterable(step_id_to_tests.values()) - if test.get_oncall() == team - } - new_tests = 
_get_new_tests(os_prefix, container) - changed_tests = Test.get_changed_tests(bazel_workspace_dir) - human_specified_tests = Test.get_human_specified_tests(bazel_workspace_dir) - - return ( - high_impact_tests.union(new_tests) - .union(changed_tests) - .union(human_specified_tests) - ) - - def _get_new_tests(prefix: str, container: TesterContainer) -> Set[str]: """ Get all local test targets that are not in database diff --git a/release/ray_release/test.py b/release/ray_release/test.py index fe1df459fd90..6341053930a7 100644 --- a/release/ray_release/test.py +++ b/release/ray_release/test.py @@ -198,24 +198,63 @@ def gen_from_s3(cls, prefix: str): ] @classmethod - def gen_high_impact_tests(cls, prefix: str) -> Dict[str, List]: + def gen_microcheck_step_ids(cls, prefix: str, bazel_workspace_dir: str) -> Set[str]: """ - Obtain the mapping from rayci step id to high impact tests with the given prefix + This function is used to get the buildkite step ids of the microcheck tests + with the given test prefix. This is used to determine the buildkite steps in + the microcheck pipeline. 
+ """ + step_ids = set() + test_targets = cls.gen_microcheck_tests(prefix, bazel_workspace_dir) + for test_target in test_targets: + test = cls.gen_from_name(f"{prefix}{test_target}") + if not test: + continue + recent_results = test.get_test_results() + if not recent_results: + continue + test_step_ids = { + result.rayci_step_id + for result in recent_results + if result.commit == recent_results[0].commit and result.rayci_step_id + } + if test_step_ids and not step_ids.intersection(test_step_ids): + step_ids.add(sorted(test_step_ids)[0]) + + return step_ids + + @classmethod + def gen_microcheck_tests( + cls, prefix: str, bazel_workspace_dir: str, team: Optional[str] = None + ) -> Set[str]: + """ + Obtain all microcheck tests with the given prefix + """ + high_impact_tests = Test._gen_high_impact_tests(prefix, team) + changed_tests = Test._get_changed_tests(bazel_workspace_dir) + human_specified_tests = Test._get_human_specified_tests(bazel_workspace_dir) + + return high_impact_tests.union(changed_tests, human_specified_tests) + + @classmethod + def _gen_high_impact_tests( + cls, prefix: str, team: Optional[str] = None + ) -> Set[str]: + """ + Obtain all high impact tests with the given prefix """ high_impact_tests = [ test for test in cls.gen_from_s3(prefix) if test.is_high_impact() ] - step_id_to_tests = {} - for test in high_impact_tests: - step_id = test.get_test_results(limit=1)[0].rayci_step_id - if not step_id: - continue - step_id_to_tests[step_id] = step_id_to_tests.get(step_id, []) + [test] + if team: + high_impact_tests = [ + test for test in high_impact_tests if test.get_oncall() == team + ] - return step_id_to_tests + return {test.get_target() for test in high_impact_tests} @classmethod - def get_human_specified_tests(cls, bazel_workspace_dir: str) -> Set[str]: + def _get_human_specified_tests(cls, bazel_workspace_dir: str) -> Set[str]: """ Get all test targets that are specified by humans """ @@ -238,7 +277,7 @@ def get_human_specified_tests(cls, 
bazel_workspace_dir: str) -> Set[str]: return tests @classmethod - def get_changed_tests(cls, bazel_workspace_dir: str) -> Set[str]: + def _get_changed_tests(cls, bazel_workspace_dir: str) -> Set[str]: """ Get all changed tests in the current PR """ diff --git a/release/ray_release/tests/test_test.py b/release/ray_release/tests/test_test.py index 2513eb1ea401..1a662dc68b71 100644 --- a/release/ray_release/tests/test_test.py +++ b/release/ray_release/tests/test_test.py @@ -61,11 +61,11 @@ def _stub_test(val: dict) -> Test: def _stub_test_result( - status: ResultStatus = ResultStatus.SUCCESS, rayci_step_id="123" + status: ResultStatus = ResultStatus.SUCCESS, rayci_step_id="123", commit="456" ) -> TestResult: return TestResult( status=status.value, - commit="1234567890", + commit=commit, branch="master", url="url", timestamp=0, @@ -333,41 +333,45 @@ def _mock_gen_test_result( ] -@patch("ray_release.test.Test.gen_from_s3") -def gen_high_impact_tests(mock_gen_from_s3) -> None: +@patch("ray_release.test.Test.gen_microcheck_test") +@patch("ray_release.test.Test.gen_from_name") +def gen_microcheck_step_ids(mock_gen_from_name, mock_gen_microcheck_test) -> None: core_test = MockTest( { - "name": "core_test", + "name": "linux://core_test", Test.KEY_IS_HIGH_IMPACT: "false", "test_results": [ - _stub_test_result(rayci_step_id="corebuild"), + _stub_test_result(rayci_step_id="corebuild", commit="123"), ], } ) data_test_01 = MockTest( { - "name": "data_test_01", + "name": "linux://data_test_01", Test.KEY_IS_HIGH_IMPACT: "true", "test_results": [ - _stub_test_result(rayci_step_id="databuild"), + _stub_test_result(rayci_step_id="databuild", commit="123"), ], } ) data_test_02 = MockTest( { - "name": "data_test_02", + "name": "linux://data_test_02", Test.KEY_IS_HIGH_IMPACT: "true", "test_results": [ - _stub_test_result(rayci_step_id="databuild"), + _stub_test_result(rayci_step_id="data15build", commit="123"), + _stub_test_result(rayci_step_id="databuild", commit="123"), + 
_stub_test_result(rayci_step_id="databuild", commit="456"), ], } ) + all_tests = [core_test, data_test_01, data_test_02] + mock_gen_microcheck_test.return_value = [test.get_target() for test in all_tests] + mock_gen_from_name.side_effect = lambda x: [ + test for test in all_tests if test.get_name() == x + ][0] - mock_gen_from_s3.return_value = [core_test, data_test_01, data_test_02] - - assert Test.gen_high_impact_tests("linux") == { - "databuild": [data_test_01, data_test_02] - } + assert Test.gen_microcheck_step_ids("linux", "") == {"databuild"} def test_get_test_target(): @@ -402,7 +406,7 @@ def test_get_changed_tests( lambda x, _: {"//t1", "//t2"} if x == "test_src" else {} ) - assert Test.get_changed_tests("") == {"//t1", "//t2"} + assert Test._get_changed_tests("") == {"//t1", "//t2"} @mock.patch.dict( @@ -413,7 +417,62 @@ def test_get_changed_tests( @mock.patch("subprocess.check_output") def test_get_human_specified_tests(mock_check_output, mock_check_call) -> None: mock_check_output.return_value = b"hi\n@microcheck //test01 //test02\nthere" - assert Test.get_human_specified_tests("") == {"//test01", "//test02"} + assert Test._get_human_specified_tests("") == {"//test01", "//test02"} + + +def test_gen_microcheck_tests() -> None: + test_harness = [ + { + "input": [], + "changed_tests": set(), + "human_tests": set(), + "output": set(), + }, + { + "input": [ + _stub_test( + { + "name": "linux://core_good", + "team": "core", + Test.KEY_IS_HIGH_IMPACT: "true", + } + ), + _stub_test( + { + "name": "linux://serve_good", + "team": "serve", + Test.KEY_IS_HIGH_IMPACT: "true", + } + ), + ], + "changed_tests": {"//core_new"}, + "human_tests": {"//human_test"}, + "output": { + "//core_good", + "//core_new", + "//human_test", + }, + }, + ] + for test in test_harness: + with mock.patch( + "ray_release.test.Test.gen_from_s3", + return_value=test["input"], + ), mock.patch( + "ray_release.test.Test._get_changed_tests", + return_value=test["changed_tests"], + ), mock.patch( + 
"ray_release.test.Test._get_human_specified_tests", + return_value=test["human_tests"], + ): + assert ( + Test.gen_microcheck_tests( + prefix="linux", + bazel_workspace_dir="", + team="core", + ) + == test["output"] + ) if __name__ == "__main__": From bbbf51e6a0f258ed33a8114a0a7dfb285c0686dd Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Mon, 27 May 2024 10:12:35 +0200 Subject: [PATCH 21/65] [RLlib] Fix metrics bug (wrt individual agent returns) in `MultiAgentEnvRunner`. (#45543) --- rllib/env/multi_agent_env_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index 043083aaa11a..ddad8b123815 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -610,8 +610,9 @@ def get_metrics(self) -> ResultDict: episode_return += return_eps2 episode_duration_s += eps2.get_duration_s() for sa_eps in eps2.agent_episodes.values(): - agent_episode_returns[str(sa_eps.agent_id)] += return_eps2 - module_episode_returns[sa_eps.module_id] += return_eps2 + return_sa = sa_eps.get_return() + agent_episode_returns[str(sa_eps.agent_id)] += return_sa + module_episode_returns[sa_eps.module_id] += return_sa del self._ongoing_episodes_for_metrics[eps.id_] self._log_episode_metrics( From baffe070eb64f9dc36797e9cab59214da9a3b811 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Mon, 27 May 2024 21:18:13 -0700 Subject: [PATCH 22/65] [Core] Adjust NodeDeathInfo in raylet (#45533) In #45357, in the case of preemption, and after the draining deadline, we expect autoscaler to directly SIGKILL the node without sending SIGTERM first. However, autoscaler would send SIGTERM first then SIGKILL. 
Therefore, this PR does the following: - At Raylet, distinguish if a SIGTERM is initiated from preemption draining or normal termination based on existing draining request - Construct node death info accordingly and send UnregisterNode() RPC to GCS Signed-off-by: Rui Qiao --- python/ray/tests/test_draining.py | 21 ++++++++---- src/ray/raylet/main.cc | 20 +++++++++-- src/ray/raylet/node_manager.cc | 11 ++---- src/ray/raylet/node_manager.h | 5 +++ .../placement_group_resource_manager_test.cc | 5 +-- .../scheduling/cluster_resource_scheduler.h | 1 + .../cluster_resource_scheduler_test.cc | 6 ++-- .../scheduling/cluster_task_manager_test.cc | 24 ++++++------- .../scheduling/local_resource_manager.cc | 34 +++++++++++++------ .../scheduling/local_resource_manager.h | 31 ++++++++++------- .../scheduling/local_resource_manager_test.cc | 10 +++--- 11 files changed, 105 insertions(+), 63 deletions(-) diff --git a/python/ray/tests/test_draining.py b/python/ray/tests/test_draining.py index 864b4322da65..360e5d5d34ad 100644 --- a/python/ray/tests/test_draining.py +++ b/python/ray/tests/test_draining.py @@ -143,7 +143,11 @@ def ping(self): assert worker_node["DeathReasonMessage"] == "preemption" -def test_preemption_after_draining_deadline(monkeypatch, ray_start_cluster): +@pytest.mark.parametrize( + "graceful", + [True, False], +) +def test_preemption_after_draining_deadline(monkeypatch, ray_start_cluster, graceful): monkeypatch.setenv("RAY_health_check_failure_threshold", "3") monkeypatch.setenv("RAY_health_check_timeout_ms", "100") monkeypatch.setenv("RAY_health_check_period_ms", "1000") @@ -182,9 +186,8 @@ def ping(self): ) assert is_accepted - # Simulate node provider forcefully terminates the worker node - # after the draining deadline. - cluster.remove_node(worker_node, False) + # Simulate autoscaler terminates the worker node after the draining deadline. 
+ cluster.remove_node(worker_node, graceful) wait_for_condition( lambda: {node["NodeID"] for node in ray.nodes() if (node["Alive"])} @@ -384,7 +387,11 @@ def get_node_id(): ray.get(obj, timeout=2) == head_node_id -def test_draining_reason(ray_start_cluster): +@pytest.mark.parametrize( + "graceful", + [False, True], +) +def test_draining_reason(ray_start_cluster, graceful): cluster = ray_start_cluster cluster.add_node(num_cpus=1, resources={"node1": 1}) ray.init( @@ -414,8 +421,8 @@ def ping(self): ) assert is_accepted - # Simulate node provider forcefully terminates the worker node - cluster.remove_node(node2, False) + # Simulate autoscaler terminates the worker node after the draining deadline. + cluster.remove_node(node2, graceful) try: ray.get(actor.ping.remote()) raise diff --git a/src/ray/raylet/main.cc b/src/ray/raylet/main.cc index caaab409f28a..2942bd50ea4a 100644 --- a/src/ray/raylet/main.cc +++ b/src/ray/raylet/main.cc @@ -439,11 +439,25 @@ int main(int argc, char *argv[]) { raylet->Start(); })); - auto signal_handler = [shutdown_raylet_gracefully_internal]( + auto signal_handler = [&raylet, shutdown_raylet_gracefully_internal]( const boost::system::error_code &error, int signal_number) { ray::rpc::NodeDeathInfo node_death_info; - node_death_info.set_reason(ray::rpc::NodeDeathInfo::EXPECTED_TERMINATION); - node_death_info.set_reason_message("received SIGTERM"); + optional drain_request = + raylet->node_manager().GetLocalDrainRequest(); + RAY_LOG(INFO) << "received SIGTERM. Existing local drain request = " + << (drain_request.has_value() ? 
drain_request->DebugString() : "None"); + if (drain_request.has_value() && + drain_request->reason() == + ray::rpc::autoscaler::DrainNodeReason::DRAIN_NODE_REASON_PREEMPTION && + drain_request->deadline_timestamp_ms() != 0 && + drain_request->deadline_timestamp_ms() < current_time_ms()) { + node_death_info.set_reason(ray::rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED); + node_death_info.set_reason_message(drain_request->reason_message()); + } else { + node_death_info.set_reason(ray::rpc::NodeDeathInfo::EXPECTED_TERMINATION); + node_death_info.set_reason_message("received SIGTERM"); + } + shutdown_raylet_gracefully_internal(node_death_info); }; boost::asio::signal_set signals(main_service); diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 0f01440b9d11..a4be13e6b533 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -1974,11 +1974,8 @@ void NodeManager::HandleDrainRaylet(rpc::DrainRayletRequest request, const bool is_idle = cluster_resource_scheduler_->GetLocalResourceManager().IsLocalNodeIdle(); if (is_idle) { - rpc::NodeDeathInfo node_death_info; - node_death_info.set_reason(rpc::NodeDeathInfo::AUTOSCALER_DRAIN_IDLE); - node_death_info.set_reason_message(request.reason_message()); cluster_resource_scheduler_->GetLocalResourceManager().SetLocalNodeDraining( - request.deadline_timestamp_ms(), node_death_info); + request); reply->set_is_accepted(true); } else { reply->set_is_accepted(false); @@ -1989,11 +1986,7 @@ void NodeManager::HandleDrainRaylet(rpc::DrainRayletRequest request, // Non-rejectable draining request. 
RAY_CHECK_EQ(request.reason(), rpc::autoscaler::DrainNodeReason::DRAIN_NODE_REASON_PREEMPTION); - rpc::NodeDeathInfo node_death_info; - node_death_info.set_reason(rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED); - node_death_info.set_reason_message(request.reason_message()); - cluster_resource_scheduler_->GetLocalResourceManager().SetLocalNodeDraining( - request.deadline_timestamp_ms(), node_death_info); + cluster_resource_scheduler_->GetLocalResourceManager().SetLocalNodeDraining(request); reply->set_is_accepted(true); } diff --git a/src/ray/raylet/node_manager.h b/src/ray/raylet/node_manager.h index 4b5ff79793cf..e39bde0505c4 100644 --- a/src/ray/raylet/node_manager.h +++ b/src/ray/raylet/node_manager.h @@ -223,6 +223,11 @@ class NodeManager : public rpc::NodeManagerServiceHandler, return mutable_object_provider_; } + /// Get the local drain request. + optional GetLocalDrainRequest() const { + return cluster_resource_scheduler_->GetLocalResourceManager().GetLocalDrainRequest(); + } + private: void ReleaseWorker(const WorkerID &worker_id) { leased_workers_.erase(worker_id); diff --git a/src/ray/raylet/placement_group_resource_manager_test.cc b/src/ray/raylet/placement_group_resource_manager_test.cc index 607d82d9e25a..e650898a6e92 100644 --- a/src/ray/raylet/placement_group_resource_manager_test.cc +++ b/src/ray/raylet/placement_group_resource_manager_test.cc @@ -184,9 +184,10 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewPrepareBundleDuringDraining) ASSERT_TRUE(new_placement_group_resource_manager_->PrepareBundles(bundle1_specs)); // Drain the node, new bundle prepare will fail. 
- rpc::NodeDeathInfo node_death_info; + rpc::DrainRayletRequest drain_request; + drain_request.set_deadline_timestamp_ms(std::numeric_limits::max()); cluster_resource_scheduler_->GetLocalResourceManager().SetLocalNodeDraining( - std::numeric_limits::max(), node_death_info); + drain_request); ASSERT_FALSE(new_placement_group_resource_manager_->PrepareBundles(bundle2_specs)); // Prepared bundles can still be committed. new_placement_group_resource_manager_->CommitBundles(bundle1_specs); diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler.h b/src/ray/raylet/scheduling/cluster_resource_scheduler.h index fbefe86a6b3b..55b40477a64d 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler.h +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler.h @@ -125,6 +125,7 @@ class ClusterResourceScheduler { bool requires_object_store_memory); LocalResourceManager &GetLocalResourceManager() { return *local_resource_manager_; } + ClusterResourceManager &GetClusterResourceManager() { return *cluster_resource_manager_; } diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler_test.cc b/src/ray/raylet/scheduling/cluster_resource_scheduler_test.cc index d9c3b67b4c6f..e7c3e3df5444 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler_test.cc +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler_test.cc @@ -362,9 +362,9 @@ TEST_F(ClusterResourceSchedulerTest, NodeAffinitySchedulingStrategyTest) { ASSERT_TRUE(resource_scheduler.GetLocalResourceManager().AllocateLocalTaskResources( resource_request, task_allocation)); // Drain the local node so that it's not schedulable for new tasks. 
- rpc::NodeDeathInfo node_death_info; - resource_scheduler.GetLocalResourceManager().SetLocalNodeDraining( - std::numeric_limits::max(), node_death_info); + rpc::DrainRayletRequest drain_request; + drain_request.set_deadline_timestamp_ms(std::numeric_limits::max()); + resource_scheduler.GetLocalResourceManager().SetLocalNodeDraining(drain_request); scheduling_strategy.mutable_node_affinity_scheduling_strategy()->set_node_id( local_node_id.Binary()); diff --git a/src/ray/raylet/scheduling/cluster_task_manager_test.cc b/src/ray/raylet/scheduling/cluster_task_manager_test.cc index 3beed07fe7b3..2fe7eec7452a 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager_test.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager_test.cc @@ -743,9 +743,9 @@ TEST_F(ClusterTaskManagerTest, DrainingWhileResolving) { ASSERT_EQ(pool_.workers.size(), 1); // Drain the local node. - rpc::NodeDeathInfo node_death_info; - scheduler_->GetLocalResourceManager().SetLocalNodeDraining( - std::numeric_limits::max(), node_death_info); + rpc::DrainRayletRequest drain_request; + drain_request.set_deadline_timestamp_ms(std::numeric_limits::max()); + scheduler_->GetLocalResourceManager().SetLocalNodeDraining(drain_request); // Arg is resolved. missing_objects_.erase(missing_arg); @@ -1078,9 +1078,9 @@ TEST_F(ClusterTaskManagerTest, NotOKPopWorkerAfterDrainingTest) { AddNode(remote_node_id, 5); // Drain the local node. 
- rpc::NodeDeathInfo node_death_info; - scheduler_->GetLocalResourceManager().SetLocalNodeDraining( - std::numeric_limits::max(), node_death_info); + rpc::DrainRayletRequest drain_request; + drain_request.set_deadline_timestamp_ms(std::numeric_limits::max()); + scheduler_->GetLocalResourceManager().SetLocalNodeDraining(drain_request); pool_.callbacks[task1.GetTaskSpecification().GetRuntimeEnvHash()].front()( nullptr, PopWorkerStatus::WorkerPendingRegistration, ""); @@ -2637,9 +2637,9 @@ TEST_F(ClusterTaskManagerTest, PopWorkerBeforeDraining) { task_manager_.QueueAndScheduleTask(task, false, false, &reply, callback); // Drain the local node. - rpc::NodeDeathInfo node_death_info; - scheduler_->GetLocalResourceManager().SetLocalNodeDraining( - std::numeric_limits::max(), node_death_info); + rpc::DrainRayletRequest drain_request; + drain_request.set_deadline_timestamp_ms(std::numeric_limits::max()); + scheduler_->GetLocalResourceManager().SetLocalNodeDraining(drain_request); std::shared_ptr worker = std::make_shared(WorkerID::FromRandom(), 1234); @@ -2677,9 +2677,9 @@ TEST_F(ClusterTaskManagerTest, UnscheduleableWhileDraining) { AddNode(remote_node_id, 5); // Drain the local node. 
- rpc::NodeDeathInfo node_death_info; - scheduler_->GetLocalResourceManager().SetLocalNodeDraining( - std::numeric_limits::max(), node_death_info); + rpc::DrainRayletRequest drain_request; + drain_request.set_deadline_timestamp_ms(std::numeric_limits::max()); + scheduler_->GetLocalResourceManager().SetLocalNodeDraining(drain_request); RayTask spillback_task = CreateTask({{ray::kCPU_ResourceLabel, 1}}); rpc::RequestWorkerLeaseReply spillback_reply; diff --git a/src/ray/raylet/scheduling/local_resource_manager.cc b/src/ray/raylet/scheduling/local_resource_manager.cc index e056f8c42d17..06a1b413175a 100644 --- a/src/ray/raylet/scheduling/local_resource_manager.cc +++ b/src/ray/raylet/scheduling/local_resource_manager.cc @@ -254,9 +254,8 @@ NodeResources LocalResourceManager::ToNodeResources() const { node_resources.available = local_resources_.available.ToNodeResourceSet(); node_resources.total = local_resources_.total.ToNodeResourceSet(); node_resources.labels = local_resources_.labels; - node_resources.is_draining = is_local_node_draining_; - node_resources.draining_deadline_timestamp_ms = - local_node_draining_deadline_timestamp_ms_; + node_resources.is_draining = IsLocalNodeDraining(); + node_resources.draining_deadline_timestamp_ms = GetDrainingDeadline(); return node_resources; } @@ -330,8 +329,7 @@ void LocalResourceManager::PopulateResourceViewSyncMessage( } resource_view_sync_message.set_is_draining(IsLocalNodeDraining()); - resource_view_sync_message.set_draining_deadline_timestamp_ms( - local_node_draining_deadline_timestamp_ms_); + resource_view_sync_message.set_draining_deadline_timestamp_ms(GetDrainingDeadline()); for (const auto &iter : last_idle_times_) { if (iter.second == absl::nullopt) { @@ -383,7 +381,8 @@ std::optional LocalResourceManager::CreateSyncMessage( void LocalResourceManager::OnResourceOrStateChanged() { if (IsLocalNodeDraining() && IsLocalNodeIdle()) { RAY_LOG(INFO) << "The node is drained, continue to shut down raylet..."; - 
shutdown_raylet_gracefully_(node_death_info_); + rpc::NodeDeathInfo node_death_info = DeathInfoFromDrainRequest(); + shutdown_raylet_gracefully_(std::move(node_death_info)); } ++version_; @@ -393,6 +392,22 @@ void LocalResourceManager::OnResourceOrStateChanged() { resource_change_subscriber_(ToNodeResources()); } +rpc::NodeDeathInfo LocalResourceManager::DeathInfoFromDrainRequest() { + rpc::NodeDeathInfo death_info; + RAY_CHECK(drain_request_.has_value()); + if (drain_request_->reason() == + rpc::autoscaler::DrainNodeReason::DRAIN_NODE_REASON_IDLE_TERMINATION) { + death_info.set_reason(rpc::NodeDeathInfo::AUTOSCALER_DRAIN_IDLE); + death_info.set_reason_message(drain_request_->reason_message()); + } else { + RAY_CHECK_EQ(drain_request_->reason(), + rpc::autoscaler::DrainNodeReason::DRAIN_NODE_REASON_PREEMPTION); + death_info.set_reason(rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED); + death_info.set_reason_message(drain_request_->reason_message()); + } + return death_info; +} + bool LocalResourceManager::ResourcesExist(scheduling::ResourceID resource_id) const { return local_resources_.total.Has(resource_id); } @@ -445,11 +460,8 @@ void LocalResourceManager::RecordMetrics() const { } void LocalResourceManager::SetLocalNodeDraining( - int64_t draining_deadline_timestamp_ms, const rpc::NodeDeathInfo &node_death_info) { - RAY_CHECK_GE(draining_deadline_timestamp_ms, 0); - is_local_node_draining_ = true; - local_node_draining_deadline_timestamp_ms_ = draining_deadline_timestamp_ms; - node_death_info_ = node_death_info; + const rpc::DrainRayletRequest &drain_request) { + drain_request_ = std::make_optional(drain_request); OnResourceOrStateChanged(); } diff --git a/src/ray/raylet/scheduling/local_resource_manager.h b/src/ray/raylet/scheduling/local_resource_manager.h index a238fffd8ec4..e206b7b9ec9f 100644 --- a/src/ray/raylet/scheduling/local_resource_manager.h +++ b/src/ray/raylet/scheduling/local_resource_manager.h @@ -31,6 +31,7 @@ #include 
"ray/gcs/gcs_client/gcs_client.h" #include "ray/util/logging.h" #include "src/ray/protobuf/gcs.pb.h" +#include "src/ray/protobuf/node_manager.pb.h" namespace ray { @@ -154,10 +155,17 @@ class LocalResourceManager : public syncer::ReporterInterface { /// Change the local node to the draining state. /// After that, no new tasks can be scheduled onto the local node. - void SetLocalNodeDraining(int64_t draining_deadline_timestamp_ms, - const rpc::NodeDeathInfo &node_death_info); + void SetLocalNodeDraining(const rpc::DrainRayletRequest &drain_request); - bool IsLocalNodeDraining() const { return is_local_node_draining_; } + bool IsLocalNodeDraining() const { return drain_request_.has_value(); } + + /// Get the local drain request. + std::optional GetLocalDrainRequest() const { + return drain_request_; + } + + /// Generate node death info from existing drain request. + rpc::NodeDeathInfo DeathInfoFromDrainRequest(); private: struct ResourceUsage { @@ -206,6 +214,12 @@ class LocalResourceManager : public syncer::ReporterInterface { absl::optional GetResourceIdleTime() const; + /// Get the draining deadline if node is in draining state. + /// + /// \return The draining deadline if node is in draining state, otherwise -1. + int64_t GetDrainingDeadline() const { + return drain_request_.has_value() ? drain_request_->deadline_timestamp_ms() : -1; + } /// Identifier of local node. scheduling::NodeID local_node_id_; /// Resources of local node. @@ -226,15 +240,8 @@ class LocalResourceManager : public syncer::ReporterInterface { // Version of this resource. It will incr by one whenever the state changed. int64_t version_ = 0; - // Whether the local node is being drained or not. - bool is_local_node_draining_ = false; - // The value is the timestamp when - // the node will be force killed. - // 0 if there is no deadline. - int64_t local_node_draining_deadline_timestamp_ms_ = -1; - - /// This is set when the node is being drained and indicates the reason for draining. 
- rpc::NodeDeathInfo node_death_info_; + /// The draining request this node received. + std::optional drain_request_; FRIEND_TEST(ClusterResourceSchedulerTest, SchedulingUpdateTotalResourcesTest); FRIEND_TEST(ClusterResourceSchedulerTest, AvailableResourceInstancesOpsTest); diff --git a/src/ray/raylet/scheduling/local_resource_manager_test.cc b/src/ray/raylet/scheduling/local_resource_manager_test.cc index 07d64a5331a6..c4ef3e243f7a 100644 --- a/src/ray/raylet/scheduling/local_resource_manager_test.cc +++ b/src/ray/raylet/scheduling/local_resource_manager_test.cc @@ -155,8 +155,9 @@ TEST_F(LocalResourceManagerTest, NodeDrainingTest) { manager->AllocateLocalTaskResources(resource_request, task_allocation); } - rpc::NodeDeathInfo node_death_info; - manager->SetLocalNodeDraining(std::numeric_limits::max(), node_death_info); + rpc::DrainRayletRequest drain_request; + drain_request.set_deadline_timestamp_ms(std::numeric_limits::max()); + manager->SetLocalNodeDraining(drain_request); ASSERT_TRUE(manager->IsLocalNodeDraining()); // Make the node idle so that the node is drained and terminated. @@ -182,8 +183,9 @@ TEST_F(LocalResourceManagerTest, ObjectStoreMemoryDrainingTest) { *used_object_store = 1; manager->UpdateAvailableObjectStoreMemResource(); - rpc::NodeDeathInfo node_death_info; - manager->SetLocalNodeDraining(std::numeric_limits::max(), node_death_info); + rpc::DrainRayletRequest drain_request; + drain_request.set_deadline_timestamp_ms(std::numeric_limits::max()); + manager->SetLocalNodeDraining(drain_request); ASSERT_TRUE(manager->IsLocalNodeDraining()); // Free object store memory so that the node is drained and terminated. 
From d9761d7d49df53383877ff9d8aeb2977270af53c Mon Sep 17 00:00:00 2001 From: Lonnie Liu <95255098+aslonnie@users.noreply.github.com> Date: Tue, 28 May 2024 07:20:56 -0700 Subject: [PATCH 23/65] [dashboard] fix test that relies on tight timing (#45561) metrics might not show up immediately Signed-off-by: Lonnie Liu --- dashboard/tests/test_dashboard.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/dashboard/tests/test_dashboard.py b/dashboard/tests/test_dashboard.py index 4170d594aeed..a5254179d948 100644 --- a/dashboard/tests/test_dashboard.py +++ b/dashboard/tests/test_dashboard.py @@ -1318,12 +1318,16 @@ async def make_blocking_call(): addr = ray_context["raylet_ip_address"] prom_addresses = [f"{addr}:{dashboard_consts.DASHBOARD_METRIC_PORT}"] - metrics_samples: Dict[str, List[Sample]] = fetch_prometheus_metrics(prom_addresses) - print(metrics_samples) + def check_lag_metrics(): + metrics_samples: Dict[str, List[Sample]] = fetch_prometheus_metrics( + prom_addresses + ) + lag_metric_samples = metrics_samples["ray_dashboard_event_loop_lag_seconds"] + assert len(lag_metric_samples) > 0 + assert any(sample.value > 1 for sample in lag_metric_samples) + return True - lag_metric_samples = metrics_samples["ray_dashboard_event_loop_lag_seconds"] - assert len(lag_metric_samples) > 0 - assert any(sample.value > 1 for sample in lag_metric_samples) + wait_for_condition(check_lag_metrics) if __name__ == "__main__": From 5f01083b543272259067b2d2636cff2c2a18aa15 Mon Sep 17 00:00:00 2001 From: simonsays1980 Date: Tue, 28 May 2024 17:14:26 +0200 Subject: [PATCH 24/65] [RLlib] EnvRunners fix min/max-metrics window sizes (from `inf` to `config.metrics_num_episodes_for_smoothing`). 
(#45575) --- rllib/env/multi_agent_env_runner.py | 2 ++ rllib/env/single_agent_env_runner.py | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index ddad8b123815..857b56081fd1 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -901,6 +901,7 @@ def _log_episode_metrics(self, length, ret, sec, agents=None, modules=None): EPISODE_RETURN_MIN: ret, }, reduce="min", + window=self.config.metrics_num_episodes_for_smoothing, ) self.metrics.log_dict( { @@ -908,4 +909,5 @@ def _log_episode_metrics(self, length, ret, sec, agents=None, modules=None): EPISODE_RETURN_MAX: ret, }, reduce="max", + window=self.config.metrics_num_episodes_for_smoothing, ) diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index 065e70781ae5..e18e32d7010a 100644 --- a/rllib/env/single_agent_env_runner.py +++ b/rllib/env/single_agent_env_runner.py @@ -818,7 +818,7 @@ def _log_episode_metrics(self, length, ret, sec): ) # For some metrics, log min/max as well. - self.metrics.log_value(EPISODE_LEN_MIN, length, reduce="min") - self.metrics.log_value(EPISODE_RETURN_MIN, ret, reduce="min") - self.metrics.log_value(EPISODE_LEN_MAX, length, reduce="max") - self.metrics.log_value(EPISODE_RETURN_MAX, ret, reduce="max") + self.metrics.log_value(EPISODE_LEN_MIN, length, reduce="min", window=win) + self.metrics.log_value(EPISODE_RETURN_MIN, ret, reduce="min", window=win) + self.metrics.log_value(EPISODE_LEN_MAX, length, reduce="max", window=win) + self.metrics.log_value(EPISODE_RETURN_MAX, ret, reduce="max", window=win) From c4d6bcfbb1473fd2f81f603e283bb8e9314a63b9 Mon Sep 17 00:00:00 2001 From: Peyton Murray Date: Tue, 28 May 2024 09:19:59 -0700 Subject: [PATCH 25/65] [Core] Add placeholder callback methods as members of RayDaskCallback (#45160) ## Why are these changes needed? 
This PR adds the overridable methods for `RayDaskCallback` to the base class. Previously, the user had a few choices for instantiating a `RayDaskCallback`: ```python RayDaskCallback(ray_presubmit=my_custom_presubmit_func) ``` or ```python with RayDaskCallback(ray_presubmit=my_custom_presubmit_func) as cb: ... ``` or by subclassing: ```python class MyCallback(RayDaskCallback): def _ray_presubmit(self, task, key, deps): .... ``` Currently the constructor for `RayDaskCallback` dynamically generates the callback methods and attaches them to the class instance upon instantiation using `setattr`. This pattern doesn't work with developer tooling including language servers, static code analysis tools, or documentation generators because these functions are generated dynamically. Additionally the documentation for dask-on-ray instructs users to use the third approach (using subclassing) rather than passing methods to the constructor. Finally, the docstring for `RayDaskCallback.__init__` currently has function signatures and pseudo-docstrings for the callback methods the user should be implementing. This docstring is itself malformed, so Sphinx mistakenly tries to interpret parts of this as references which do not exist. At the request of @c21 I've actually implemented these functions as they appear in the docstring. Type hints are incomplete here, but match what was already in the previous docstring. I also added a summary in the dask-on-ray documentation. Unblocks #39658. --- doc/source/ray-more-libs/dask-on-ray.rst | 12 ++ python/ray/util/dask/callbacks.py | 174 +++++++++++++---------- 2 files changed, 107 insertions(+), 79 deletions(-) diff --git a/doc/source/ray-more-libs/dask-on-ray.rst b/doc/source/ray-more-libs/dask-on-ray.rst index 5ff6578471eb..918628b8baa9 100644 --- a/doc/source/ray-more-libs/dask-on-ray.rst +++ b/doc/source/ray-more-libs/dask-on-ray.rst @@ -271,3 +271,15 @@ execution time exceeds some user-defined threshold: *submitted*, not executed. 
This callback API is currently unstable and subject to change. + +.. autosummary:: + :nosignatures: + :toctree: doc/ + + ray.util.dask.callbacks.RayDaskCallback + ray.util.dask.callbacks.RayDaskCallback._ray_presubmit + ray.util.dask.callbacks.RayDaskCallback._ray_postsubmit + ray.util.dask.callbacks.RayDaskCallback._ray_pretask + ray.util.dask.callbacks.RayDaskCallback._ray_posttask + ray.util.dask.callbacks.RayDaskCallback._ray_postsubmit_all + ray.util.dask.callbacks.RayDaskCallback._ray_finish diff --git a/python/ray/util/dask/callbacks.py b/python/ray/util/dask/callbacks.py index 8cbc5c311f9f..458402b059ac 100644 --- a/python/ray/util/dask/callbacks.py +++ b/python/ray/util/dask/callbacks.py @@ -1,7 +1,9 @@ import contextlib +from ray import ObjectRef from collections import namedtuple, defaultdict from datetime import datetime +from typing import Any, List, Optional from dask.callbacks import Callback @@ -48,85 +50,6 @@ class RayDaskCallback(Callback): ray_active = set() def __init__(self, **kwargs): - """ - Ray-specific callbacks: - - def _ray_presubmit(task, key, deps): - Run before submitting a Ray task. If this callback returns a - non-`None` value, a Ray task will _not_ be created and this - value will be used as the would-be task's result value. - - Args: - task: A Dask task, where the first tuple item is - the task function, and the remaining tuple items are - the task arguments (either the actual argument values, - or Dask keys into the deps dictionary whose - corresponding values are the argument values). - key: The Dask graph key for the given task. - deps: The dependencies of this task. - - Returns: - Either None, in which case a Ray task will be submitted, or - a non-None value, in which case a Ray task will not be - submitted and this return value will be used as the - would-be task result value. - - - def _ray_postsubmit(task, key, deps, object_ref): - Run after submitting a Ray task. 
- - Args: - task: A Dask task, where the first tuple item is - the task function, and the remaining tuple items are - the task arguments (either the actual argument values, - or Dask keys into the deps dictionary whose - corresponding values are the argument values). - key: The Dask graph key for the given task. - deps: The dependencies of this task. - object_ref (ray.ObjectRef): The object reference for the - return value of the Ray task. - - - def _ray_pretask(key, object_refs): - Run before executing a Dask task within a Ray task. This - executes after the task has been submitted, within a Ray - worker. The return value of this task will be passed to the - _ray_posttask callback, if provided. - - Args: - key: The Dask graph key for the Dask task. - object_refs (List[ray.ObjectRef]): The object references - for the arguments of the Ray task. - - Returns: - A value that will be passed to the corresponding - _ray_posttask callback, if said callback is defined. - - - def _ray_posttask(key, result, pre_state): - Run after executing a Dask task within a Ray task. This - executes within a Ray worker. This callback receives the return - value of the _ray_pretask callback, if provided. - - Args: - key: The Dask graph key for the Dask task. - result: The task result value. - pre_state: The return value of the corresponding - _ray_pretask callback, if said callback is defined. - - - def _ray_postsubmit_all(object_refs, dsk): - Run after all Ray tasks have been submitted. - - Args: - object_refs (List[ray.ObjectRef]): The object references - for the output (leaf) Ray tasks of the task graph. - dsk: The Dask graph. - - - def _ray_finish(result): - Run after all Ray tasks have finished executing and the final - result has been returned. - - Args: - result: The final result (output) of the Dask - computation, before any repackaging is done by - Dask collection-specific post-compute callbacks. 
- """ for cb in CBS: cb_func = kwargs.pop(cb, None) if cb_func is not None: @@ -156,6 +79,99 @@ def unregister(self): type(self).ray_active.remove(self._ray_callback) super().unregister() + def _ray_presubmit(self, task, key, deps) -> Optional[Any]: + """Run before submitting a Ray task. + + If this callback returns a non-`None` value, Ray does _not_ create + a task and uses this value as the would-be task's result value. + + Args: + task: A Dask task, where the first tuple item is + the task function, and the remaining tuple items are + the task arguments, which are either the actual argument values, + or Dask keys into the deps dictionary whose + corresponding values are the argument values. + key: The Dask graph key for the given task. + deps: The dependencies of this task. + + Returns: + Either None, in which case Ray submits a task, or + a non-None value, in which case Ray doesn't submit + a task and uses this return value as the + would-be task result value. + """ + pass + + def _ray_postsubmit(self, task, key, deps, object_ref: ObjectRef): + """Run after submitting a Ray task. + + Args: + task: A Dask task, where the first tuple item is + the task function, and the remaining tuple items are + the task arguments, which are either the actual argument values, + or Dask keys into the deps dictionary whose + corresponding values are the argument values. + key: The Dask graph key for the given task. + deps: The dependencies of this task. + object_ref: The object reference for the + return value of the Ray task. + + """ + pass + + def _ray_pretask(self, key, object_refs: List[ObjectRef]): + """Run before executing a Dask task within a Ray task. + + This method executes after Ray submits the task within a Ray + worker. Ray passes the return value of this task to the + _ray_posttask callback, if provided. + + Args: + key: The Dask graph key for the Dask task. + object_refs: The object references + for the arguments of the Ray task. 
+ + Returns: + A value that Ray passes to the corresponding + _ray_posttask callback, if the callback is defined. + """ + pass + + def _ray_posttask(self, key, result, pre_state): + """Run after executing a Dask task within a Ray task. + + This method executes within a Ray worker. This callback receives the + return value of the _ray_pretask callback, if provided. + + Args: + key: The Dask graph key for the Dask task. + result: The task result value. + pre_state: The return value of the corresponding + _ray_pretask callback, if said callback is defined. + """ + pass + + def _ray_postsubmit_all(self, object_refs: List[ObjectRef], dsk): + """Run after Ray submits all tasks. + + Args: + object_refs: The object references + for the output (leaf) Ray tasks of the task graph. + dsk: The Dask graph. + """ + pass + + def _ray_finish(self, result): + """Run after Ray finishes executing all Ray tasks and returns the final + result. + + Args: + result: The final result (output) of the Dask + computation, before any repackaging is done by + Dask collection-specific post-compute callbacks. + """ + pass + class add_ray_callbacks: def __init__(self, *callbacks): From ce0932baa4b3dc535ceb968cea1aca354a5b983e Mon Sep 17 00:00:00 2001 From: Hongchao Deng Date: Tue, 28 May 2024 10:26:37 -0700 Subject: [PATCH 26/65] [core] add method to kill aws instance to simulate chaos (#45546) Signed-off-by: hongchaodeng --- python/ray/_private/test_utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/python/ray/_private/test_utils.py b/python/ray/_private/test_utils.py index 16fc00dbb939..62e51016af2e 100644 --- a/python/ray/_private/test_utils.py +++ b/python/ray/_private/test_utils.py @@ -1533,6 +1533,25 @@ def _kill_resource(self, node_id, node_to_kill_ip, node_to_kill_port): ) self.killed.add(node_id) + def _terminate_ec2_instance(self, ip): + # This command uses IMDSv2 to get the host instance id and region. + # After that it terminates itself using aws cli. 
+ multi_line_command = ( + 'TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600");' # noqa: E501 + 'instanceId=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id/);' # noqa: E501 + 'region=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/region);' # noqa: E501 + "aws ec2 terminate-instances --region $region --instance-ids $instanceId" # noqa: E501 + ) + # This is a feature on Anyscale platform that enables + # easy ssh access to worker nodes. + ssh_command = f"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p 2222 ray@{ip} '{multi_line_command}'" # noqa: E501 + + result = subprocess.run( + ssh_command, shell=True, capture_output=True, text=True, check=True + ) + print(f"STDOUT:\n{result.stdout}\n") + print(f"STDERR:\n{result.stderr}\n") + def _kill_raylet(self, ip, port, graceful=False): import grpc from grpc._channel import _InactiveRpcError From fa0e6d7dbab6d9e36c7b39889065d012b8ffbe1a Mon Sep 17 00:00:00 2001 From: Peyton Murray Date: Tue, 28 May 2024 10:44:16 -0700 Subject: [PATCH 27/65] [Doc] Rework Serve example page (#45231) ## Why are these changes needed? This PR changes the grouping of the Serve examples page, and allows the grouping of any example page to be configurable by changing the value of the `groupby` key in `examples.yml`. ## Related issue number Closes #44457. ## Checks - [x] I've signed off every commit(by using the -s flag, i.e., `git commit -s`) in this PR. - [x] I've run `scripts/format.sh` to lint the changes in this PR. - [x] I've included any doc changes needed for https://docs.ray.io/en/master/. - [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file. - [x] I've made sure the tests are passing. 
Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [x] Unit tests - [ ] Release tests - [ ] This PR is not tested :( --------- Signed-off-by: pdmurray Signed-off-by: Peyton Murray Co-authored-by: angelinalg <122562471+angelinalg@users.noreply.github.com> --- doc/source/custom_directives.py | 77 +++++++++++++++++++++++++++++---- doc/source/data/examples.yml | 1 + doc/source/serve/examples.yml | 14 +++++- doc/source/train/examples.yml | 3 +- 4 files changed, 84 insertions(+), 11 deletions(-) diff --git a/doc/source/custom_directives.py b/doc/source/custom_directives.py index ed6e1b165975..3e45de0cb91a 100644 --- a/doc/source/custom_directives.py +++ b/doc/source/custom_directives.py @@ -1,6 +1,7 @@ from collections.abc import Iterable from enum import Enum import re +from collections import defaultdict import sphinx from typing import List, Dict, Union, Callable, Any, Optional, Tuple import copy @@ -403,6 +404,10 @@ def formatted_name(cls: type) -> str: """Return the formatted name for the class.""" raise NotImplementedError + @classmethod + def key(cls: type) -> str: + raise NotImplementedError + class Contributor(ExampleEnum): RAY_TEAM = "Maintained by the Ray Team" @@ -418,6 +423,10 @@ def tag(self): def formatted_name(cls): return "All Examples" + @classmethod + def key(cls: type) -> str: + return "contributor" + class UseCase(ExampleEnum): """Use case type for example metadata.""" @@ -431,6 +440,10 @@ class UseCase(ExampleEnum): def formatted_name(cls): return "Use Case" + @classmethod + def key(cls: type) -> str: + return "use_case" + class SkillLevel(ExampleEnum): """Skill level type for example metadata.""" @@ -443,6 +456,10 @@ class SkillLevel(ExampleEnum): def formatted_name(cls): return "Skill Level" + @classmethod + def key(cls: type) -> str: + return "skill_level" + class Framework(ExampleEnum): """Framework type for example metadata.""" @@ -462,6 +479,24 @@ class 
Framework(ExampleEnum): def formatted_name(cls): return "Framework" + @classmethod + def key(cls: type) -> str: + return "framework" + + +class RelatedTechnology(ExampleEnum): + ML_APPLICATIONS = "ML Applications" + INTEGRATIONS = "Integrations" + AI_ACCELERATORS = "AI Accelerators" + + @classmethod + def formatted_name(cls): + return "Related Technology" + + @classmethod + def key(cls: type) -> str: + return "related_technology" + class Library(ExampleEnum): """Library type for example metadata.""" @@ -474,6 +509,10 @@ class Library(ExampleEnum): def formatted_name(cls): return "Library" + @classmethod + def key(cls: type) -> str: + return "library" + @classmethod def from_path(cls, path: Union[pathlib.Path, str]) -> "Library": """Instantiate a Library instance from a path. @@ -534,6 +573,9 @@ def __init__( else: self.contributor = Contributor.RAY_TEAM + related_technology = config.get("related_technology", "").strip() + if related_technology: + self.related_technology = RelatedTechnology(related_technology) self.skill_level = SkillLevel(config.get("skill_level")) self.title = config.get("title") @@ -590,6 +632,17 @@ def __init__( self.columns_to_show = self.parse_columns_to_show( config.get("columns_to_show", []) ) + groupby = config.get("groupby", "skill_level") + for cls in ExampleEnum.__subclasses__(): + if cls.key() == groupby: + self.groupby = cls + break + else: + valid_classes = [cls.key() for cls in ExampleEnum.__subclasses__()] + raise ValueError( + f"Unable to find class to group example entries by {groupby}. " + f"Valid choices are {valid_classes}.", + ) def parse_columns_to_show(self, columns: str) -> Dict[str, type]: """Parse the columns to show in the library example page for the config. 
@@ -675,17 +728,23 @@ def render_library_examples(config: pathlib.Path = None) -> bs4.BeautifulSoup: if config is None: config = (pathlib.Path(app.confdir) / pagename).parent / "examples.yml" - # Separate the examples into different skill levels - examples = { - SkillLevel.BEGINNER: [], - SkillLevel.INTERMEDIATE: [], - SkillLevel.ADVANCED: [], - } # Keep track of whether any of the examples have frameworks metadata; the # column will not be shown if no frameworks metadata exists on any example. + + # Group the examples by the ExampleConfig.groupby value: + examples = defaultdict(list) example_config = ExampleConfig(config, app.srcdir) for example in example_config: - examples[example.skill_level].append(example) + try: + group = getattr(example, example_config.groupby.key()) + except AttributeError as e: + raise AttributeError( + f"Example {example.link} has no {example_config.groupby.key()} " + "key, but needs one because the examples for library " + f"{example_config.library.value} are configured to be grouped " + f"by {example_config.groupby.key()}." 
+ ) from e + examples[group].append(example) # Construct a table of examples soup = bs4.BeautifulSoup() @@ -700,12 +759,12 @@ def render_library_examples(config: pathlib.Path = None) -> bs4.BeautifulSoup: soup.append(page_text) container = soup.new_tag("div", attrs={"class": "example-index"}) - for level, examples in examples.items(): + for group, examples in examples.items(): if not examples: continue header = soup.new_tag("h2", attrs={"class": "example-header"}) - header.append(level.value) + header.append(group.value) container.append(header) table = soup.new_tag("table", attrs={"class": ["table", "example-table"]}) diff --git a/doc/source/data/examples.yml b/doc/source/data/examples.yml index 8f25b358b969..291893550f0c 100644 --- a/doc/source/data/examples.yml +++ b/doc/source/data/examples.yml @@ -1,6 +1,7 @@ text: Below are examples for using Ray Data for batch inference workloads with a variety of frameworks and use cases. columns_to_show: - frameworks +groupby: skill_level examples: - title: Image Classification Batch Inference with PyTorch ResNet152 skill_level: beginner diff --git a/doc/source/serve/examples.yml b/doc/source/serve/examples.yml index 9093a19c53f6..93551cecd99b 100644 --- a/doc/source/serve/examples.yml +++ b/doc/source/serve/examples.yml @@ -1,4 +1,5 @@ text: Below are tutorials for exploring Ray Serve capabilities and learning how to integrate different modeling frameworks. 
+groupby: related_technology examples: - title: Serve ML Models skill_level: beginner @@ -7,12 +8,14 @@ examples: use_cases: - computer vision link: tutorials/serve-ml-models + related_technology: ml applications - title: Serve a Stable Diffusion Model skill_level: beginner use_cases: - computer vision - generative ai link: tutorials/stable-diffusion + related_technology: ml applications - title: Serve a Large Language Model skill_level: beginner use_cases: @@ -26,22 +29,26 @@ examples: use_cases: - natural language processing link: tutorials/text-classification + related_technology: ml applications - title: Serve an Object Detection Model skill_level: beginner use_cases: - computer vision link: tutorials/object-detection + related_technology: ml applications - title: Serve an Inference Model on AWS NeuronCores Using FastAPI skill_level: intermediate use_cases: - natural language processing link: tutorials/aws-neuron-core-inference + related_technology: ai accelerators - title: Serve an Inference with Stable Diffusion Model on AWS NeuronCores Using FastAPI skill_level: intermediate use_cases: - computer vision - generative ai link: tutorials/aws-neuron-core-inference-stable-diffusion + related_technology: ai accelerators - title: Serve a model on Intel Gaudi Accelerator skill_level: intermediate frameworks: @@ -50,6 +57,7 @@ examples: - generative ai - large language models link: tutorials/intel-gaudi-inference + related_technology: ai accelerators - title: Scale a Gradio App with Ray Serve skill_level: intermediate use_cases: @@ -57,6 +65,7 @@ examples: - large language models - natural language processing link: tutorials/gradio-integration + related_technology: integrations - title: Serve a Text Generator with Request Batching skill_level: intermediate use_cases: @@ -64,6 +73,7 @@ examples: - large language models - natural language processing link: tutorials/batch + related_technology: integrations - title: Serve a Chatbot with Request and Response Streaming 
skill_level: intermediate use_cases: @@ -71,13 +81,15 @@ examples: - large language models - natural language processing link: tutorials/streaming + related_technology: ml applications - title: Serving models with Triton Server in Ray Serve skill_level: intermediate use_cases: - computer vision - generative ai link: tutorials/triton-server-integration - + related_technology: integrations - title: Serve a Java App skill_level: advanced link: tutorials/java + related_technology: integrations diff --git a/doc/source/train/examples.yml b/doc/source/train/examples.yml index d46b53964b41..3130c590c911 100644 --- a/doc/source/train/examples.yml +++ b/doc/source/train/examples.yml @@ -1,6 +1,7 @@ -text: Below are examples for using Ray Train with a variety of frameworks and use cases. Ray Train makes it easy to scale out each of these examples to a large cluster of GPUs! +text: Below are examples for using Ray Train with a variety of frameworks and use cases. Ray Train makes it easy to scale out each of these examples to a large cluster of GPUs. columns_to_show: - frameworks +groupby: skill_level examples: - title: Train an image classifier with PyTorch skill_level: beginner From 678823dd0788abe6f776c22311a1ecf62f387869 Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Tue, 28 May 2024 20:44:41 +0200 Subject: [PATCH 28/65] [RLlib] Change/fix MetricsLogger `log_n_dicts()` logic. 
(#45585) --- rllib/BUILD | 2 +- rllib/algorithms/algorithm.py | 8 +- rllib/algorithms/dqn/dqn.py | 12 +- rllib/algorithms/ppo/ppo.py | 8 +- rllib/core/learner/learner_group.py | 4 +- .../examples/evaluation/custom_evaluation.py | 2 +- rllib/tuned_examples/dqn/cartpole_dqn.py | 3 +- rllib/tuned_examples/ppo/cartpole_ppo.py | 6 +- rllib/utils/metrics/metrics_logger.py | 131 ++++++++++- rllib/utils/metrics/stats.py | 222 ++++++++++++------ 10 files changed, 300 insertions(+), 98 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 097ad26ca80c..e6966fa341b6 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -376,7 +376,7 @@ py_test( py_test( name = "learning_tests_cartpole_ppo", main = "tuned_examples/ppo/cartpole_ppo.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "no_tf_static_graph"], + tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "torch_only"], size = "large", srcs = ["tuned_examples/ppo/cartpole_ppo.py"], args = ["--as-test", "--enable-new-api-stack"] diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 5646fcd2eee1..6c4e771f0ac4 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -1304,7 +1304,7 @@ def _env_runner_remote(worker, num, round, iter): ) num_episodes = env_runner_results[NUM_EPISODES] else: - self.metrics.log_n_dicts( + self.metrics.merge_and_log_n_dicts( all_metrics, key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS), ) @@ -1492,7 +1492,7 @@ def _env_runner_remote(worker, num, round, iter): ) num_episodes = env_runner_results[NUM_EPISODES] else: - self.metrics.log_n_dicts( + self.metrics.merge_and_log_n_dicts( all_metrics, key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS), ) @@ -1633,7 +1633,7 @@ def training_step(self) -> ResultDict: train_batch = train_batch.as_multi_agent() # Reduce EnvRunner metrics over the n EnvRunners. 
- self.metrics.log_n_dicts(env_runner_results, key=ENV_RUNNER_RESULTS) + self.metrics.merge_and_log_n_dicts(env_runner_results, key=ENV_RUNNER_RESULTS) # Only train if train_batch is not empty. # In an extreme situation, all rollout workers die during the @@ -3327,7 +3327,7 @@ def _run_one_training_iteration_and_evaluation_in_parallel_wo_thread( NUM_ENV_STEPS_SAMPLED_THIS_ITER, 0 ) else: - self.metrics.log_n_dicts( + self.metrics.merge_and_log_n_dicts( all_metrics, key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS), ) diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index 2368b733376f..e7d61a666e1b 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -616,7 +616,9 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: # Add the sampled experiences to the replay buffer. self.local_replay_buffer.add(episodes) # Reduce EnvRunner metrics over the n EnvRunners. - self.metrics.log_n_dicts(env_runner_results, key=ENV_RUNNER_RESULTS) + self.metrics.merge_and_log_n_dicts( + env_runner_results, key=ENV_RUNNER_RESULTS + ) self.metrics.log_value( NUM_ENV_STEPS_SAMPLED_LIFETIME, @@ -683,7 +685,9 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: mid: {TD_ERROR_KEY: np.concatenate(s, axis=0)} for mid, s in td_errors.items() } - self.metrics.log_n_dicts(learner_results, key=LEARNER_RESULTS) + self.metrics.merge_and_log_n_dicts( + learner_results, key=LEARNER_RESULTS + ) self.metrics.log_value( NUM_ENV_STEPS_TRAINED_LIFETIME, self.metrics.peek( @@ -722,7 +726,9 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: timestep=current_ts, ) # log the additional results as well. - self.metrics.log_n_dicts(additional_results, key=LEARNER_RESULTS) + self.metrics.merge_and_log_n_dicts( + additional_results, key=LEARNER_RESULTS + ) # Update weights and global_vars - after learning on the local worker - # on all remote workers. 
diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 0390e639b009..ce491f9ca09d 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -456,7 +456,9 @@ def _training_step_new_api_stack(self) -> ResultDict: return {} # Reduce EnvRunner metrics over the n EnvRunners. - self.metrics.log_n_dicts(env_runner_results, key=ENV_RUNNER_RESULTS) + self.metrics.merge_and_log_n_dicts( + env_runner_results, key=ENV_RUNNER_RESULTS + ) # Log lifetime counts for env- and agent steps. self.metrics.log_dict( { @@ -483,7 +485,7 @@ def _training_step_new_api_stack(self) -> ResultDict: ), num_iters=self.config.num_sgd_iter, ) - self.metrics.log_n_dicts(learner_results, key=LEARNER_RESULTS) + self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS) self.metrics.log_dict( { NUM_ENV_STEPS_TRAINED_LIFETIME: self.metrics.peek( @@ -546,7 +548,7 @@ def _training_step_new_api_stack(self) -> ResultDict: sampled_kl_values=kl_dict, timestep=self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME), ) - self.metrics.log_n_dicts(additional_results, key=LEARNER_RESULTS) + self.metrics.merge_and_log_n_dicts(additional_results, key=LEARNER_RESULTS) return self.metrics.reduce() diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index b955ec78d1f1..acc42ef83dea 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -478,7 +478,7 @@ def _learner_update( results = tree.flatten_up_to( [[None] * len(r) for r in results], results ) - self._metrics_logger_old_and_hybrid_stack.log_n_dicts(results) + self._metrics_logger_old_and_hybrid_stack.merge_and_log_n_dicts(results) results = self._metrics_logger_old_and_hybrid_stack.reduce( # We are returning to a client (Algorithm) that does NOT make any # use of MetricsLogger (or Stats) -> Convert all values to non-Stats @@ -577,7 +577,7 @@ def additional_update( # the existing behavior of returning an already reduced dict (as if we had a # 
reduce_fn). if not self.config.enable_env_runner_and_connector_v2: - self._metrics_logger_old_and_hybrid_stack.log_n_dicts(results) + self._metrics_logger_old_and_hybrid_stack.merge_and_log_n_dicts(results) results = self._metrics_logger_old_and_hybrid_stack.reduce( return_stats_obj=False ) diff --git a/rllib/examples/evaluation/custom_evaluation.py b/rllib/examples/evaluation/custom_evaluation.py index 8d8e869b1146..d396ffee04df 100644 --- a/rllib/examples/evaluation/custom_evaluation.py +++ b/rllib/examples/evaluation/custom_evaluation.py @@ -143,7 +143,7 @@ def custom_eval_function( # You can compute metrics from the episodes manually, or use the Algorithm's # convenient MetricsLogger to store all evaluation metrics inside the main # algo. - algorithm.metrics.log_n_dicts( + algorithm.metrics.merge_and_log_n_dicts( env_runner_metrics, key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS) ) eval_results = algorithm.metrics.reduce( diff --git a/rllib/tuned_examples/dqn/cartpole_dqn.py b/rllib/tuned_examples/dqn/cartpole_dqn.py index 95e580a53b51..4d1f6e07381f 100644 --- a/rllib/tuned_examples/dqn/cartpole_dqn.py +++ b/rllib/tuned_examples/dqn/cartpole_dqn.py @@ -24,7 +24,7 @@ # Settings identical to old stack. 
model_config_dict={ "fcnet_hiddens": [256], - "fcnet_activation": "relu", + "fcnet_activation": "tanh", "epsilon": [(0, 1.0), (10000, 0.02)], "fcnet_bias_initializer": "zeros_", "post_fcnet_bias_initializer": "zeros_", @@ -40,6 +40,7 @@ "alpha": 0.6, "beta": 0.4, }, + n_step=3, double_q=True, num_atoms=1, noisy=False, diff --git a/rllib/tuned_examples/ppo/cartpole_ppo.py b/rllib/tuned_examples/ppo/cartpole_ppo.py index 3865c9991652..92ec15da7bec 100644 --- a/rllib/tuned_examples/ppo/cartpole_ppo.py +++ b/rllib/tuned_examples/ppo/cartpole_ppo.py @@ -19,7 +19,6 @@ enable_rl_module_and_learner=True, enable_env_runner_and_connector_v2=True, ) - .env_runners(num_env_runners=1) .environment("CartPole-v1") .rl_module( model_config_dict={ @@ -39,12 +38,13 @@ evaluation_num_env_runners=1, evaluation_interval=1, evaluation_parallel_to_training=True, + evaluation_config=PPOConfig.overrides(exploration=False), ) ) stop = { - f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 100000, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 150.0, + f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 200000, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 350.0, } diff --git a/rllib/utils/metrics/metrics_logger.py b/rllib/utils/metrics/metrics_logger.py index a329fd86fe0d..8de81deb0038 100644 --- a/rllib/utils/metrics/metrics_logger.py +++ b/rllib/utils/metrics/metrics_logger.py @@ -1,9 +1,10 @@ import logging -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import tree # pip install dm_tree from ray.rllib.utils import force_tuple +from ray.rllib.utils.deprecation import Deprecated from ray.rllib.utils.metrics.stats import Stats from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.util.annotations import PublicAPI @@ -283,9 +284,9 @@ def _map(path, stat_or_value): tree.map_structure_with_path(_map, stats_dict) - def log_n_dicts( + def merge_and_log_n_dicts( self, - stats_dicts, + 
stats_dicts: List[Dict[str, Any]], *, key: Optional[Union[str, Tuple[str]]] = None, reduce: Optional[str] = "mean", @@ -293,11 +294,120 @@ def log_n_dicts( ema_coeff: Optional[float] = None, clear_on_reduce: bool = False, ) -> None: - """TODO (sven): docstr + """Merges n dicts, generated by n parallel components, and logs the results. + + .. testcode:: + + from ray.rllib.utils.metrics.metrics_logger import MetricsLogger + from ray.rllib.utils.test_utils import check + + # Example: n Learners logging loss stats to be merged. + # Note that losses should usually be logged with a window=1 so they don't + # get smeared over time and instead provide an accurate picture of the + # current situation. + main_logger = MetricsLogger() + + logger_learner1 = MetricsLogger() + logger_learner1.log_value("loss", 0.1, window=1) + learner1_results = logger_learner1.reduce() + + logger_learner2 = MetricsLogger() + logger_learner2.log_value("loss", 0.2, window=1) + learner2_results = logger_learner2.reduce() + + # Merge the stats from both Learners. + main_logger.merge_and_log_n_dicts( + [learner1_results, learner2_results], + key="learners", + ) + check(main_logger.peek("learners", "loss"), 0.15) + + # Example: m EnvRunners logging episode returns to be merged. + main_logger = MetricsLogger() + + logger_env_runner1 = MetricsLogger() + logger_env_runner1.log_value("mean_ret", 100.0, window=3) + logger_env_runner1.log_value("mean_ret", 200.0) + logger_env_runner1.log_value("mean_ret", 300.0) + logger_env_runner1.log_value("mean_ret", 400.0) + env_runner1_results = logger_env_runner1.reduce() + + logger_env_runner2 = MetricsLogger() + logger_env_runner2.log_value("mean_ret", 150.0, window=3) + logger_env_runner2.log_value("mean_ret", 250.0) + logger_env_runner2.log_value("mean_ret", 350.0) + logger_env_runner2.log_value("mean_ret", 450.0) + env_runner2_results = logger_env_runner2.reduce() + + # Merge the stats from both EnvRunners. 
+ main_logger.merge_and_log_n_dicts( + [env_runner1_results, env_runner2_results], + key="env_runners", + ) + # The expected procedure is as follows: + # The individual internal values lists of the two loggers are as follows: + # env runner 1: [100, 200, 300, 400] + # env runner 2: [150, 250, 350, 450] + # Move backwards from index=-1 (each time, loop through both env runners) + # index=-1 -> [400, 450] -> reduce-mean -> [425] -> repeat 2 times (number + # of env runners) -> [425, 425] + # index=-2 -> [300, 350] -> reduce-mean -> [325] -> repeat 2 times + # -> append -> [425, 425, 325, 325] -> STOP b/c we have reached >= window. + # reverse the list -> [325, 325, 425, 425] + check( + main_logger.stats["env_runners"]["mean_ret"].values, + [325, 325, 425, 425], + ) + check(main_logger.peek("env_runners", "mean_ret"), (325 + 425 + 425) / 3) + + # Example: Lifetime sum over n parallel components' stats. + main_logger = MetricsLogger() + + logger1 = MetricsLogger() + logger1.log_value("some_stat", 50, reduce="sum", window=None) + logger1.log_value("some_stat", 25, reduce="sum", window=None) + logger1_results = logger1.reduce() + + logger2 = MetricsLogger() + logger2.log_value("some_stat", 75, reduce="sum", window=None) + logger2_results = logger2.reduce() + + # Merge the stats from both Learners. + main_logger.merge_and_log_n_dicts([logger1_results, logger2_results]) + check(main_logger.peek("some_stat"), 150) + + # Example: Sum over n parallel components' stats with a window of 3. 
+ main_logger = MetricsLogger() + + logger1 = MetricsLogger() + logger1.log_value("some_stat", 50, reduce="sum", window=3) + logger1.log_value("some_stat", 25, reduce="sum") + logger1.log_value("some_stat", 10, reduce="sum") + logger1.log_value("some_stat", 5, reduce="sum") + logger1_results = logger1.reduce() + + logger2 = MetricsLogger() + logger2.log_value("some_stat", 75, reduce="sum", window=3) + logger2.log_value("some_stat", 100, reduce="sum") + logger2_results = logger2.reduce() + + # Merge the stats from both Learners. + main_logger.merge_and_log_n_dicts([logger1_results, logger2_results]) + # The expected procedure is as follows: + # The individual internal values lists of the two loggers are as follows: + # env runner 1: [50, 25, 10, 5] + # env runner 2: [75, 100] + # Move backwards from index=-1 (each time, loop through both loggers) + # index=-1 -> [5, 100] -> leave as-is, b/c we are sum'ing -> [5, 100] + # index=-2 -> [10, 75] -> leave as-is -> [5, 100, 10, 75] -> STOP b/c we + # have reached >= window. + # reverse the list -> [75, 10, 100, 5] + check(main_logger.peek("some_stat"), 115) # last 3 items (window) get sum'd Args: - stats_dicts: - key: + stats_dicts: List of n stats dicts to be merged and then logged. + key: Optional top-level key under which to log all keys/key sequences + found in the n `stats_dicts`. reduce: The reduction method to apply, once `self.reduce()` is called. If None, will collect all logged values under `key` in a list (and also return that list upon calling `self.reduce()`). @@ -637,7 +747,8 @@ def reduce( MetricsLogger object. Component A now calls its n remote components, each of which returns an equivalent, reduced dict with `Stats` as leafs. Component A can then further log these n result dicts via its own MetricsLogger: - `logger.log_n_dicts([n returned result dicts from the remote components])`. + `logger.merge_and_log_n_dicts([n returned result dicts from the remote + components])`. .. 
testcode:: @@ -679,7 +790,7 @@ def reduce( # Now combine the 2 equivalent results into 1 end result dict. downstream_logger = MetricsLogger() - downstream_logger.log_n_dicts([result1, result2]) + downstream_logger.merge_and_log_n_dicts([result1, result2]) # What happens internally is that both values lists of the 2 components # are merged (concat'd) and randomly shuffled, then clipped at 10 (window # size). This is done such that no component has an "advantage" over the @@ -782,3 +893,7 @@ def _set_key(self, flat_key, stats): if key not in _dict: _dict[key] = {} _dict = _dict[key] + + @Deprecated(new="MetricsLogger.merge_and_log_n_dicts()", error=True) + def log_n_dicts(self, *args, **kwargs): + pass diff --git a/rllib/utils/metrics/stats.py b/rllib/utils/metrics/stats.py index 49793f11f4c3..eec5845fd1a9 100644 --- a/rllib/utils/metrics/stats.py +++ b/rllib/utils/metrics/stats.py @@ -299,44 +299,78 @@ def merge_in_parallel(self, *others: "Stats") -> None: Thereby, the newly incoming values of `others` are treated equally with respect to each other as well as with respect to the internal values of self. - Use this method to merge another Stats into this one that resulted from a - parallel run and metrics logging of `self` and n `others` (for example, n - Learner workers all returning a loss value). + Use this method to merge other `Stats` objects, which resulted from some + parallelly executed components, into this one. For example: n Learner workers + all returning a loss value in the form of `{"total_loss": [some value]}`. - The following examples demonstrate the parallel merging logic: + The following examples demonstrate the parallel merging logic for different + reduce- and window settings: .. testcode:: - from ray.rllib.utils.metrics.stats import Stats from ray.rllib.utils.test_utils import check - # Parallel-merge two mean stats (win=3). + # Parallel-merge two (reduce=mean) stats with window=3. 
stats = Stats(reduce="mean", window=3) stats1 = Stats(reduce="mean", window=3) + stats1.push(0) stats1.push(1) stats1.push(2) stats1.push(3) stats2 = Stats(reduce="mean", window=3) - stats1.push(4) - stats1.push(5) - stats1.push(6) + stats2.push(4000) + stats2.push(4) + stats2.push(5) + stats2.push(6) stats.merge_in_parallel(stats1, stats2) - check(stats.values, [3.5, 3.5, 3.5]) - - # Parallel-merge two max stats (win=3). + # Fill new merged-values list: + # - Start with index -1, moving to the start. + # - Thereby always reducing across the different Stats objects' at the + # current index. + # - The resulting reduced value (across Stats at current index) is then + # repeated AND + # added to the new merged-values list n times (where n is the number of + # Stats, across + # which we merge). + # - The merged-values list is reversed. + # Here: + # index -1: [3, 6] -> [4.5, 4.5] + # index -2: [2, 5] -> [4.5, 4.5, 3.5, 3.5] + # STOP after merged list contains >= 3 items (window size) + # reverse: [3.5, 3.5, 4.5, 4.5] + check(stats.values, [3.5, 3.5, 4.5, 4.5]) + check(stats.peek(), (3.5 + 4.5 + 4.5) / 3) # mean last 3 items (window) + + # Parallel-merge two (reduce=max) stats with window=3. stats = Stats(reduce="max", window=3) stats1 = Stats(reduce="max", window=3) stats1.push(1) stats1.push(2) stats1.push(3) stats2 = Stats(reduce="max", window=3) - stats1.push(4) - stats1.push(5) - stats1.push(6) + stats2.push(4) + stats2.push(5) + stats2.push(6) stats.merge_in_parallel(stats1, stats2) - check(stats.values, [6]) - - # Parallel-merge two min stats (win=4). + # Same here: Fill new merged-values list: + # - Start with index -1, moving to the start. + # - Thereby always reducing across the different Stats objects' at the + # current index. + # - The resulting reduced value (across Stats at current index) is then + # repeated AND + # added to the new merged-values list n times (where n is the number of + # Stats, across + # which we merge). 
+ # - The merged-values list is reversed. + # Here: + # index -1: [3, 6] -> [6, 6] + # index -2: [2, 5] -> [6, 6, 5, 5] + # STOP after merged list contains >= 3 items (window size) + # reverse: [5, 5, 6, 6] + check(stats.values, [5, 5, 6, 6]) + check(stats.peek(), 6) # max is 6 + + # Parallel-merge two (reduce=min) stats with window=4. stats = Stats(reduce="min", window=4) stats1 = Stats(reduce="min", window=4) stats1.push(1) @@ -344,34 +378,55 @@ def merge_in_parallel(self, *others: "Stats") -> None: stats1.push(1) stats1.push(4) stats2 = Stats(reduce="min", window=4) - stats1.push(5) - stats1.push(0.5) - stats1.push(7) - stats1.push(8) + stats2.push(5) + stats2.push(0.5) + stats2.push(7) + stats2.push(8) stats.merge_in_parallel(stats1, stats2) - check(stats.values, [0.5]) - - # Parallel-merge two sum stats (no window). + # Same procedure: + # index -1: [4, 8] -> [4, 4] + # index -2: [1, 7] -> [4, 4, 1, 1] + # STOP after merged list contains >= 4 items (window size) + # reverse: [1, 1, 4, 4] + check(stats.values, [1, 1, 4, 4]) + check(stats.peek(), 1) # min is 1 + + # Parallel-merge two (reduce=sum) stats with no window. + # Note that when reduce="sum", we do NOT reduce across the indices of the + # parallel Stats objects; values are kept as-is. stats = Stats(reduce="sum") stats1 = Stats(reduce="sum") stats1.push(1) stats1.push(2) + stats1.push(0) stats1.push(3) stats2 = Stats(reduce="sum") - stats1.push(4) - stats1.push(5) - stats1.push(6) + stats2.push(4) + stats2.push(5) + stats2.push(6) + # index -1: [3, 6] -> [3, 6] (no reduction, leave values as-is) + # index -2: [0, 5] -> [3, 6, 0, 5] + # index -3: [2, 4] -> [3, 6, 0, 5, 2, 4] + # index -4: [1] -> [3, 6, 0, 5, 2, 4, 1] + # STOP b/c all values have been consumed (no window, so no early stop) + # reverse: [1, 4, 2, 5, 0, 6, 3] stats.merge_in_parallel(stats1, stats2) - check(stats.values, [21]) + check(stats.values, [1, 4, 2, 5, 0, 6, 3]) + check(stats.peek(), 21) - # Parallel-merge two "concat" stats (reduce=None; no win).
+ # Parallel-merge two "concat" (reduce=None) stats with no window. + # Note that when reduce=None, we do NOT reduce across the indices of the + # parallel Stats objects; values are kept as-is. stats = Stats(reduce=None, window=float("inf"), clear_on_reduce=True) stats1 = Stats(reduce=None, window=float("inf"), clear_on_reduce=True) stats1.push(1) stats2 = Stats(reduce=None, window=float("inf"), clear_on_reduce=True) - stats1.push(2) + stats2.push(2) + # index -1: [1, 2] -> [1, 2] (no reduction, leave values as-is) + # reverse: [2, 1] stats.merge_in_parallel(stats1, stats2) - check(stats.values, [1, 2]) + check(stats.values, [2, 1]) + check(stats.peek(), [2, 1]) Args: others: One or more other Stats objects that need to be parallely merged @@ -383,21 +438,39 @@ def merge_in_parallel(self, *others: "Stats") -> None: assert all(self._window == o._window for o in others) assert all(self._ema_coeff == o._ema_coeff for o in others) - # Extend `self`'s values by all `others`' values. - for o in others: - self.values.extend(o.values) - - # Reduce over the entire values (no matter the window size!) first. - store_win = self._window - self._window = None - reduced_value, new_values = self._reduced_values() - self._window = store_win - # x-fold the reduced_value over the actual window size and use the resulting - # list as new self.values. - if self._reduce_method == "mean": - self.values = [reduced_value] * self._window - else: - self.values = new_values + win = self._window or float("inf") + + # Take turns stepping through `self` and `*others` values, thereby moving + # backwards from last index to beginning and fill up the resulting values list. + # Stop as soon as we reach the window size. + new_values = [] + tmp_values = [] + # Loop from index=-1 backward to index=start until our new_values list has + # at least a len of `win`. + for i in range(1, max(map(len, [self, *others])) + 1): + # Per index, loop through all involved stats, including `self` and add + # to `tmp_values`.
+ for stats in [self, *others]: + if len(stats) < i: + continue + tmp_values.append(stats.values[-i]) + + # Now reduce across `tmp_values` based on the reduce-settings of this Stats. + # TODO (sven) : explain why all this + if self._ema_coeff is not None: + new_values.extend([np.nanmean(tmp_values)] * len(tmp_values)) + elif self._reduce_method in [None, "sum"]: + new_values.extend(tmp_values) + else: + new_values.extend( + [self._reduced_values(values=tmp_values, window=float("inf"))[0]] + * len(tmp_values) + ) + tmp_values.clear() + if len(new_values) >= win: + break + + self.values = list(reversed(new_values)) def numpy(self, value: Any = None) -> "Stats": """Converts all of self's internal values to numpy (if a tensor).""" @@ -491,39 +564,47 @@ def similar_to(other: "Stats", init_value: Optional[Any] = None): clear_on_reduce=other._clear_on_reduce, ) - def _reduced_values(self) -> Tuple[Any, Any]: - """Runs a non-commited reduction procedure on the internal values list. + def _reduced_values(self, values=None, window=None) -> Tuple[Any, Any]: + """Runs a non-committed reduction procedure on given values (or `self.values`). + + Note that this method does NOT alter any state of `self` or the possibly + provided list of `values`. It only returns new values as they should be + adopted after a possible, actual reduction step. + + Args: + values: The list of values to reduce. If None, use `self.values`. + window: A possible override window setting to use (instead of + `self._window`). Use float('inf') here for an infinite window size. Returns: A tuple containing 1) the reduced value and 2) the new internal values list to be used. """ + values = values if values is not None else self.values + window = window if window is not None else self._window + + # Apply the window (if provided and not inf). + values = ( + values if window is None or window == float("inf") else values[-window:] + ) + # No reduction method. Return list as-is OR reduce list to len=window.
if self._reduce_method is None: - # No window -> return all internal values. - if self._window is None or self._window == float("inf"): - return self.values, self.values - # Window -> return shortened internal values list. - else: - return self.values[-self._window :], self.values[-self._window :] + return values, values # Special case: Internal values list is empty -> return NaN. - elif len(self.values) == 0: + elif len(values) == 0: return float("nan"), [] - # Do EMA (always a "mean" reduction). - elif self._ema_coeff is not None: + + # Do EMA (always a "mean" reduction; possibly using a window). + if self._ema_coeff is not None: # Perform EMA reduction over all values in internal values list. - mean_value = self.values[0] - for v in self.values[1:]: + mean_value = values[0] + for v in values[1:]: mean_value = self._ema_coeff * v + (1.0 - self._ema_coeff) * mean_value - return mean_value, [mean_value] + return mean_value, values # Do non-EMA reduction (possibly using a window). else: - values = ( - self.values - if self._window is None or self._window == float("inf") - else self.values[-self._window :] - ) # Use the numpy/torch "nan"-prefix to ignore NaN's in our value lists. if torch and torch.is_tensor(values[0]): reduce_meth = getattr(torch, "nan" + self._reduce_method) @@ -548,16 +629,13 @@ def _reduced_values(self) -> Tuple[Any, Any]: # For window=None|inf (infinite window) and reduce != mean, we don't have to # keep any values, except the last (reduced) one. - if ( - self._window is None or self._window == float("inf") - ) and self._reduce_method != "mean": + if window in [None, float("inf")] and self._reduce_method != "mean": # TODO (sven): What if out values are torch tensors? In this case, we # would have to do reduction using `torch` above (not numpy) and only # then return the python primitive AND put the reduced new torch # tensor in `new_values`. 
- new_values = [reduced] + return reduced, [reduced] # In all other cases, keep the values that were also used for the reduce # operation. else: - new_values = values - return reduced, new_values + return reduced, values From ce47bcaf2e263a069df706d7481eb29a4e3b9b9d Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Tue, 28 May 2024 12:00:49 -0700 Subject: [PATCH 29/65] [ci][doc/01] move module walking class to ray_ci (#45527) Move module walking class and the logic to find class/function api to ray_ci, so that it can be tested. The `Module` class walks through all its children attribute, check if an attribute is an API (by checking a special field name) and record if it is. Test: - CI --------- Signed-off-by: can --- ci/ray_ci/doc/BUILD.bazel | 30 ++++++++++++++ ci/ray_ci/doc/api.py | 21 ++++++++++ ci/ray_ci/doc/mock_module.py | 42 +++++++++++++++++++ ci/ray_ci/doc/module.py | 78 ++++++++++++++++++++++++++++++++++++ ci/ray_ci/doc/test_module.py | 20 +++++++++ 5 files changed, 191 insertions(+) create mode 100644 ci/ray_ci/doc/BUILD.bazel create mode 100644 ci/ray_ci/doc/api.py create mode 100644 ci/ray_ci/doc/mock_module.py create mode 100644 ci/ray_ci/doc/module.py create mode 100644 ci/ray_ci/doc/test_module.py diff --git a/ci/ray_ci/doc/BUILD.bazel b/ci/ray_ci/doc/BUILD.bazel new file mode 100644 index 000000000000..85feb3222ad9 --- /dev/null +++ b/ci/ray_ci/doc/BUILD.bazel @@ -0,0 +1,30 @@ +load("@rules_python//python:defs.bzl", "py_library", "py_test") +load("@py_deps_buildkite//:requirements.bzl", ci_require = "requirement") + +py_library( + name = "doc", + srcs = glob( + ["*.py"], + exclude = [ + "test_*.py", + "cmd_*.py", + "mock_module.py", + ], + ), + visibility = ["//ci/ray_ci/doc:__subpackages__"], +) + +py_test( + name = "test_module", + size = "small", + srcs = ["test_module.py", "mock_module.py"], + exec_compatible_with = ["//:hermetic_python"], + tags = [ + "ci_unit", + "team:ci", + ], + deps = [ + 
":doc", + ci_require("pytest"), + ], +) diff --git a/ci/ray_ci/doc/api.py b/ci/ray_ci/doc/api.py new file mode 100644 index 000000000000..25996fa7d436 --- /dev/null +++ b/ci/ray_ci/doc/api.py @@ -0,0 +1,21 @@ +from enum import Enum +from dataclasses import dataclass + + +class AnnotationType(Enum): + PUBLIC_API = "PublicAPI" + DEVELOPER_API = "DeveloperAPI" + DEPRECATED = "Deprecated" + UNKNOWN = "Unknown" + + +class CodeType(Enum): + CLASS = "Class" + FUNCTION = "Function" + + +@dataclass +class API: + name: str + annotation_type: AnnotationType + code_type: CodeType diff --git a/ci/ray_ci/doc/mock_module.py b/ci/ray_ci/doc/mock_module.py new file mode 100644 index 000000000000..39328ba83419 --- /dev/null +++ b/ci/ray_ci/doc/mock_module.py @@ -0,0 +1,42 @@ +from ci.ray_ci.doc.api import AnnotationType + + +def PublicAPI(*args, **kwargs): + if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): + return PublicAPI()(args[0]) + + def wrap(obj): + obj._annotated = None + obj._annotated_type = AnnotationType.PUBLIC_API + return obj + + return wrap + + +def Deprecated(*args, **kwargs): + if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): + return Deprecated()(args[0]) + + def wrap(obj): + obj._annotated = None + obj._annotated_type = AnnotationType.DEPRECATED + return obj + + return wrap + + +@PublicAPI +class MockClass: + """ + This class is used for testing purpose only. It should not be used in production. + """ + + pass + + +@Deprecated +def mock_function(): + """ + This function is used for testing purpose only. It should not be used in production. 
+ """ + pass diff --git a/ci/ray_ci/doc/module.py b/ci/ray_ci/doc/module.py new file mode 100644 index 000000000000..deacb3d28b55 --- /dev/null +++ b/ci/ray_ci/doc/module.py @@ -0,0 +1,78 @@ +import importlib +import inspect +from types import ModuleType +from typing import List + +from ci.ray_ci.doc.api import API, AnnotationType, CodeType + + +class Module: + def __init__(self, module: str): + self._module = importlib.import_module(module) + self._visited = set() + self._apis = [] + + def walk(self) -> None: + self._walk(self._module) + + def get_apis(self) -> List[API]: + self.walk() + return self._apis + + def _walk(self, module: ModuleType) -> None: + """ + Depth-first search through the module and its children to find annotated classes + and functions. + """ + if module in self._visited: + return + self._visited.add(module.__hash__) + + if not self._is_valid_child(module): + return + + for child in dir(module): + attribute = getattr(module, child) + + if inspect.ismodule(attribute): + self._walk(attribute) + if inspect.isclass(attribute): + if self._is_api(attribute): + self._apis.append( + API( + name=self._fullname(attribute), + annotation_type=self._get_annotation_type(attribute), + code_type=CodeType.CLASS, + ) + ) + self._walk(attribute) + if inspect.isfunction(attribute): + if self._is_api(attribute): + self._apis.append( + API( + name=self._fullname(attribute), + annotation_type=self._get_annotation_type(attribute), + code_type=CodeType.FUNCTION, + ) + ) + + return + + def _fullname(self, module: ModuleType) -> str: + return f"{module.__module__}.{module.__qualname__}" + + def _is_valid_child(self, module: ModuleType) -> bool: + """ + This module is a valid child of the top level module if it is the top level + module itself, or its module name starts with the top level module name. 
+ """ + module = inspect.getmodule(module) + if not hasattr(module, "__name__"): + return False + return module.__name__.startswith(self._module.__name__) + + def _is_api(self, module: ModuleType) -> bool: + return self._is_valid_child(module) and hasattr(module, "_annotated") + + def _get_annotation_type(self, module: ModuleType) -> AnnotationType: + return AnnotationType(module._annotated_type.value) diff --git a/ci/ray_ci/doc/test_module.py b/ci/ray_ci/doc/test_module.py new file mode 100644 index 000000000000..a55e41c2fd25 --- /dev/null +++ b/ci/ray_ci/doc/test_module.py @@ -0,0 +1,20 @@ +import sys +import pytest + +from ci.ray_ci.doc.module import Module +from ci.ray_ci.doc.api import AnnotationType, CodeType + + +def test_walk(): + module = Module("ci.ray_ci.doc.mock_module") + apis = module.get_apis() + assert apis[0].name == "ci.ray_ci.doc.mock_module.MockClass" + assert apis[0].annotation_type.value == AnnotationType.PUBLIC_API.value + assert apis[0].code_type.value == CodeType.CLASS.value + assert apis[1].name == "ci.ray_ci.doc.mock_module.mock_function" + assert apis[1].annotation_type.value == AnnotationType.DEPRECATED.value + assert apis[1].code_type.value == CodeType.FUNCTION.value + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) From c3e6ecab361eef84f3ce3c8940ec413c88486db0 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 28 May 2024 12:16:29 -0700 Subject: [PATCH 30/65] [Data] Add `metadata` and `block_refs` properties to `RefBundle` (#45567) RefBundle stores data as a List[Tuple[ObjectRef, BlockMetadata]]. Often, we'll need to access either just the object references (List[ObjectRef]) or the metadata (List[BlockMetadata]). To avoid boilerplate code to access this data, this PR adds properties to separately access the object references and metadata. 
Signed-off-by: Balaji Veeramani --- .../interfaces/op_runtime_metrics.py | 6 ++-- .../execution/interfaces/ref_bundle.py | 28 +++++++++++++------ .../data/_internal/execution/legacy_compat.py | 5 ++-- .../execution/operators/input_data_buffer.py | 2 +- .../execution/operators/map_operator.py | 3 +- .../operators/task_pool_map_operator.py | 3 +- .../ray/data/_internal/planner/aggregate.py | 5 ++-- .../pull_based_shuffle_task_scheduler.py | 3 +- .../push_based_shuffle_task_scheduler.py | 3 +- .../_internal/planner/randomize_blocks.py | 3 +- python/ray/data/_internal/planner/sort.py | 5 ++-- python/ray/data/tests/test_operators.py | 22 ++++++++------- python/ray/data/tests/test_split.py | 2 +- .../data/tests/test_streaming_integration.py | 8 +++--- 14 files changed, 51 insertions(+), 47 deletions(-) diff --git a/python/ray/data/_internal/execution/interfaces/op_runtime_metrics.py b/python/ray/data/_internal/execution/interfaces/op_runtime_metrics.py index 5ea4f4cdc71e..51b7ab9f30e3 100644 --- a/python/ray/data/_internal/execution/interfaces/op_runtime_metrics.py +++ b/python/ray/data/_internal/execution/interfaces/op_runtime_metrics.py @@ -512,12 +512,10 @@ def on_task_finished(self, task_index: int, exception: Optional[Exception]): input_size, ) - blocks = [input[0] for input in inputs.blocks] - metadata = [input[1] for input in inputs.blocks] ctx = ray.data.context.DataContext.get_current() if ctx.enable_get_object_locations_for_metrics: - locations = ray.experimental.get_object_locations(blocks) - for block, meta in zip(blocks, metadata): + locations = ray.experimental.get_object_locations(inputs.block_refs) + for block, meta in inputs.blocks: if locations[block].get("did_spill", False): assert meta.size_bytes is not None self.obj_store_mem_spilled += meta.size_bytes diff --git a/python/ray/data/_internal/execution/interfaces/ref_bundle.py b/python/ray/data/_internal/execution/interfaces/ref_bundle.py index 6304f48432a8..c8996a1c422b 100644 --- 
a/python/ray/data/_internal/execution/interfaces/ref_bundle.py +++ b/python/ray/data/_internal/execution/interfaces/ref_bundle.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Optional, Tuple +from typing import List, Optional, Tuple import ray from .common import NodeIdStr @@ -58,19 +58,29 @@ def __setattr__(self, key, value): raise ValueError(f"The `{key}` field of RefBundle cannot be updated.") object.__setattr__(self, key, value) + @property + def block_refs(self) -> List[BlockMetadata]: + """List of block references in this bundle.""" + return [block_ref for block_ref, _ in self.blocks] + + @property + def metadata(self) -> List[BlockMetadata]: + """List of block metadata in this bundle.""" + return [metadata for _, metadata in self.blocks] + def num_rows(self) -> Optional[int]: """Number of rows present in this bundle, if known.""" total = 0 - for b in self.blocks: - if b[1].num_rows is None: + for m in self.metadata: + if m.num_rows is None: return None else: - total += b[1].num_rows + total += m.num_rows return total def size_bytes(self) -> int: """Size of the blocks of this bundle in bytes.""" - return sum(b[1].size_bytes for b in self.blocks) + return sum(m.size_bytes for m in self.metadata) def destroy_if_owned(self) -> int: """Clears the object store memory for these blocks if owned. @@ -79,8 +89,10 @@ def destroy_if_owned(self) -> int: The number of bytes freed. """ should_free = self.owns_blocks and DataContext.get_current().eager_free - for b in self.blocks: - trace_deallocation(b[0], "RefBundle.destroy_if_owned", free=should_free) + for block_ref in self.block_refs: + trace_deallocation( + block_ref, "RefBundle.destroy_if_owned", free=should_free + ) return self.size_bytes() if should_free else 0 def get_cached_location(self) -> Optional[NodeIdStr]: @@ -91,7 +103,7 @@ def get_cached_location(self) -> Optional[NodeIdStr]: if self._cached_location is None: # Only consider the first block in the bundle for now. 
TODO(ekl) consider # taking into account other blocks. - ref = self.blocks[0][0] + ref = self.block_refs[0] # This call is pretty fast for owned objects (~5k/s), so we don't need to # batch it for now. locs = ray.experimental.get_object_locations([ref]) diff --git a/python/ray/data/_internal/execution/legacy_compat.py b/python/ray/data/_internal/execution/legacy_compat.py index c216617eeb0f..aa4e7ba64ee5 100644 --- a/python/ray/data/_internal/execution/legacy_compat.py +++ b/python/ray/data/_internal/execution/legacy_compat.py @@ -204,9 +204,8 @@ def _bundles_to_block_list(bundles: Iterator[RefBundle]) -> BlockList: for ref_bundle in bundles: if not ref_bundle.owns_blocks: owns_blocks = False - for block, meta in ref_bundle.blocks: - blocks.append(block) - metadata.append(meta) + blocks.extend(ref_bundle.block_refs) + metadata.extend(ref_bundle.metadata) return BlockList(blocks, metadata, owned_by_consumer=owns_blocks) diff --git a/python/ray/data/_internal/execution/operators/input_data_buffer.py b/python/ray/data/_internal/execution/operators/input_data_buffer.py index ba781b660713..cc0447d8d26e 100644 --- a/python/ray/data/_internal/execution/operators/input_data_buffer.py +++ b/python/ray/data/_internal/execution/operators/input_data_buffer.py @@ -80,7 +80,7 @@ def _initialize_metadata(self): self._num_output_bundles = len(self._input_data) block_metadata = [] for bundle in self._input_data: - block_metadata.extend([m for (_, m) in bundle.blocks]) + block_metadata.extend(bundle.metadata) self._stats = { "input": block_metadata, } diff --git a/python/ray/data/_internal/execution/operators/map_operator.py b/python/ray/data/_internal/execution/operators/map_operator.py index 075e28bc6c1c..931dbd4aeb7c 100644 --- a/python/ray/data/_internal/execution/operators/map_operator.py +++ b/python/ray/data/_internal/execution/operators/map_operator.py @@ -370,8 +370,7 @@ def _get_next_inner(self) -> RefBundle: assert self._started bundle = self._output_queue.get_next() 
self._metrics.on_output_dequeued(bundle) - for _, meta in bundle.blocks: - self._output_metadata.append(meta) + self._output_metadata.extend(bundle.metadata) return bundle @abstractmethod diff --git a/python/ray/data/_internal/execution/operators/task_pool_map_operator.py b/python/ray/data/_internal/execution/operators/task_pool_map_operator.py index 8f4fc7fee2b8..42fa68030f48 100644 --- a/python/ray/data/_internal/execution/operators/task_pool_map_operator.py +++ b/python/ray/data/_internal/execution/operators/task_pool_map_operator.py @@ -63,7 +63,6 @@ def __init__( def _add_bundled_input(self, bundle: RefBundle): # Submit the task as a normal Ray task. map_task = cached_remote_fn(_map_task, num_returns="streaming") - input_blocks = [block for block, _ in bundle.blocks] ctx = TaskContext( task_idx=self._next_data_task_idx, target_max_block_size=self.actual_target_max_block_size, @@ -84,7 +83,7 @@ def _add_bundled_input(self, bundle: RefBundle): self._map_transformer_ref, data_context, ctx, - *input_blocks, + *bundle.block_refs, ) self._submit_data_task(gen, bundle) diff --git a/python/ray/data/_internal/planner/aggregate.py b/python/ray/data/_internal/planner/aggregate.py index 6cd86ad3938d..d31afc5a7c00 100644 --- a/python/ray/data/_internal/planner/aggregate.py +++ b/python/ray/data/_internal/planner/aggregate.py @@ -39,9 +39,8 @@ def fn( blocks = [] metadata = [] for ref_bundle in refs: - for block, block_metadata in ref_bundle.blocks: - blocks.append(block) - metadata.append(block_metadata) + blocks.extend(ref_bundle.block_refs) + metadata.extend(ref_bundle.metadata) if len(blocks) == 0: return (blocks, {}) unified_schema = unify_block_metadata_schema(metadata) diff --git a/python/ray/data/_internal/planner/exchange/pull_based_shuffle_task_scheduler.py b/python/ray/data/_internal/planner/exchange/pull_based_shuffle_task_scheduler.py index 1176202b9758..b2cf448f030d 100644 --- a/python/ray/data/_internal/planner/exchange/pull_based_shuffle_task_scheduler.py 
+++ b/python/ray/data/_internal/planner/exchange/pull_based_shuffle_task_scheduler.py @@ -41,8 +41,7 @@ def execute( # eagerly release the blocks' memory. input_blocks_list = [] for ref_bundle in refs: - for block, _ in ref_bundle.blocks: - input_blocks_list.append(block) + input_blocks_list.extend(ref_bundle.block_refs) input_num_blocks = len(input_blocks_list) input_owned = all(b.owns_blocks for b in refs) diff --git a/python/ray/data/_internal/planner/exchange/push_based_shuffle_task_scheduler.py b/python/ray/data/_internal/planner/exchange/push_based_shuffle_task_scheduler.py index 2c5130fe8933..cafd5f31f4da 100644 --- a/python/ray/data/_internal/planner/exchange/push_based_shuffle_task_scheduler.py +++ b/python/ray/data/_internal/planner/exchange/push_based_shuffle_task_scheduler.py @@ -457,8 +457,7 @@ def execute( # processed concurrently. input_blocks_list = [] for ref_bundle in refs: - for block, _ in ref_bundle.blocks: - input_blocks_list.append(block) + input_blocks_list.extend(ref_bundle.block_refs) input_owned = all(b.owns_blocks for b in refs) if map_ray_remote_args is None: diff --git a/python/ray/data/_internal/planner/randomize_blocks.py b/python/ray/data/_internal/planner/randomize_blocks.py index b2fc9c950c0e..835017f2cafd 100644 --- a/python/ray/data/_internal/planner/randomize_blocks.py +++ b/python/ray/data/_internal/planner/randomize_blocks.py @@ -22,8 +22,7 @@ def fn( nonlocal op blocks_with_metadata = [] for ref_bundle in refs: - for block, meta in ref_bundle.blocks: - blocks_with_metadata.append((block, meta)) + blocks_with_metadata.extend(ref_bundle.blocks) if len(blocks_with_metadata) == 0: return refs, {op._name: []} diff --git a/python/ray/data/_internal/planner/sort.py b/python/ray/data/_internal/planner/sort.py index fddb80a72dde..3ca5af144006 100644 --- a/python/ray/data/_internal/planner/sort.py +++ b/python/ray/data/_internal/planner/sort.py @@ -32,9 +32,8 @@ def fn( blocks = [] metadata = [] for ref_bundle in refs: - for block, 
block_metadata in ref_bundle.blocks: - blocks.append(block) - metadata.append(block_metadata) + blocks.extend(ref_bundle.block_refs) + metadata.extend(ref_bundle.metadata) if len(blocks) == 0: return (blocks, {}) sort_key.validate_schema(unify_block_metadata_schema(metadata)) diff --git a/python/ray/data/tests/test_operators.py b/python/ray/data/tests/test_operators.py index b48d94c79fb7..54ea30c17234 100644 --- a/python/ray/data/tests/test_operators.py +++ b/python/ray/data/tests/test_operators.py @@ -44,8 +44,8 @@ def _get_blocks(bundle: RefBundle, output_list: List[Block]): - for block, _ in bundle.blocks: - output_list.append(list(ray.get(block)["id"])) + for block_ref in bundle.block_refs: + output_list.append(list(ray.get(block_ref)["id"])) def _mul2_transform(block_iter: Iterable[Block], ctx) -> Iterable[Block]: @@ -196,9 +196,11 @@ def test_split_operator(ray_start_regular_shared, equal, chunk_size): while op.has_next(): ref = op.get_next() assert ref.owns_blocks, ref - for block, _ in ref.blocks: + for block_ref in ref.block_refs: assert ref.output_split_idx is not None - output_splits[ref.output_split_idx].extend(list(ray.get(block)["id"])) + output_splits[ref.output_split_idx].extend( + list(ray.get(block_ref)["id"]) + ) op.all_inputs_done() expected_splits = [[] for _ in range(num_splits)] @@ -234,8 +236,8 @@ def test_split_operator_random(ray_start_regular_shared, equal, random_seed): while op.has_next(): ref = op.get_next() assert ref.owns_blocks, ref - for block, _ in ref.blocks: - output_splits[ref.output_split_idx].extend(list(ray.get(block)["id"])) + for block_ref in ref.block_refs: + output_splits[ref.output_split_idx].extend(list(ray.get(block_ref)["id"])) if equal: actual = [len(output_splits[i]) for i in range(3)] expected = [num_inputs // 3] * 3 @@ -271,8 +273,8 @@ def get_bundle_loc(bundle): while op.has_next(): ref = op.get_next() assert ref.owns_blocks, ref - for block, _ in ref.blocks: - 
output_splits[ref.output_split_idx].extend(list(ray.get(block)["id"])) + for block_ref in ref.block_refs: + output_splits[ref.output_split_idx].extend(list(ray.get(block_ref)["id"])) total = 0 for i in range(2): @@ -583,8 +585,8 @@ def test_limit_operator(ray_start_regular_shared): def _get_bundles(bundle: RefBundle): output = [] - for block, _ in bundle.blocks: - output.extend(list(ray.get(block)["id"])) + for block_ref in bundle.block_refs: + output.extend(list(ray.get(block_ref)["id"])) return output diff --git a/python/ray/data/tests/test_split.py b/python/ray/data/tests/test_split.py index 8fe481338baf..db729ba93f24 100644 --- a/python/ray/data/tests/test_split.py +++ b/python/ray/data/tests/test_split.py @@ -667,7 +667,7 @@ def equalize_helper(input_block_lists: List[List[List[Any]]]): result_block_lists = [] for bundle in result: block_list = [] - for block_ref, _ in bundle.blocks: + for block_ref in bundle.block_refs: block = ray.get(block_ref) block_accessor = BlockAccessor.for_block(block) block_list.append(list(block_accessor.to_default()["id"])) diff --git a/python/ray/data/tests/test_streaming_integration.py b/python/ray/data/tests/test_streaming_integration.py index a317af32ef83..9f2672123c06 100644 --- a/python/ray/data/tests/test_streaming_integration.py +++ b/python/ray/data/tests/test_streaming_integration.py @@ -43,8 +43,8 @@ def map_fn(block_iter, _): def ref_bundles_to_list(bundles: List[RefBundle]) -> List[List[Any]]: output = [] for bundle in bundles: - for block, _ in bundle.blocks: - output.append(list(ray.get(block)["id"])) + for block_ref in bundle.block_refs: + output.append(list(ray.get(block_ref)["id"])) return output @@ -144,8 +144,8 @@ def run(self): def get_outputs(out: List[RefBundle]): outputs = [] for bundle in out: - for block, _ in bundle.blocks: - ids: pd.Series = ray.get(block)["id"] + for block_ref in bundle.block_refs: + ids: pd.Series = ray.get(block_ref)["id"] outputs.extend(ids.values) return outputs From 
73f6fb26034251e39ed0aad1765443b8f2df99a0 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 28 May 2024 12:16:51 -0700 Subject: [PATCH 31/65] [Data] Remove dead code from `ray.data._internal.compute` (#45564) Signed-off-by: Balaji Veeramani --- python/ray/data/_internal/compute.py | 129 +-------------------------- 1 file changed, 2 insertions(+), 127 deletions(-) diff --git a/python/ray/data/_internal/compute.py b/python/ray/data/_internal/compute.py index 10a477589922..c5b1612ffe3f 100644 --- a/python/ray/data/_internal/compute.py +++ b/python/ray/data/_internal/compute.py @@ -1,19 +1,8 @@ import logging -import math -from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union +from typing import Any, Callable, Iterable, Optional, TypeVar, Union -from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder from ray.data._internal.execution.interfaces import TaskContext -from ray.data.block import ( - Block, - BlockAccessor, - BlockExecStats, - BlockMetadata, - BlockPartition, - UserDefinedFunction, -) -from ray.data.context import DataContext -from ray.types import ObjectRef +from ray.data.block import Block, UserDefinedFunction from ray.util.annotations import DeveloperAPI, PublicAPI logger = logging.getLogger(__name__) @@ -161,117 +150,3 @@ def is_task_compute(compute_spec: Union[str, ComputeStrategy]) -> bool: or compute_spec == "tasks" or isinstance(compute_spec, TaskPoolStrategy) ) - - -def _map_block_split( - block_fn: BlockTransform, - input_files: List[str], - fn: Optional[UserDefinedFunction], - num_blocks: int, - *blocks_and_fn_args: Union[Block, Any], - **fn_kwargs, -) -> BlockPartition: - stats = BlockExecStats.builder() - blocks, fn_args = blocks_and_fn_args[:num_blocks], blocks_and_fn_args[num_blocks:] - if fn is not None: - fn_args = (fn,) + fn_args - new_metas = [] - for new_block in block_fn(blocks, *fn_args, **fn_kwargs): - accessor = BlockAccessor.for_block(new_block) - new_meta = BlockMetadata( - 
num_rows=accessor.num_rows(), - size_bytes=accessor.size_bytes(), - schema=accessor.schema(), - input_files=input_files, - exec_stats=stats.build(), - ) - yield new_block - new_metas.append(new_meta) - stats = BlockExecStats.builder() - yield new_metas - - -def _map_block_nosplit( - block_fn: BlockTransform, - input_files: List[str], - fn: Optional[UserDefinedFunction], - num_blocks: int, - *blocks_and_fn_args: Union[Block, Any], - **fn_kwargs, -) -> Tuple[Block, BlockMetadata]: - stats = BlockExecStats.builder() - builder = DelegatingBlockBuilder() - blocks, fn_args = blocks_and_fn_args[:num_blocks], blocks_and_fn_args[num_blocks:] - if fn is not None: - fn_args = (fn,) + fn_args - for new_block in block_fn(blocks, *fn_args, **fn_kwargs): - builder.add_block(new_block) - new_block = builder.build() - accessor = BlockAccessor.for_block(new_block) - return new_block, accessor.get_metadata( - input_files=input_files, exec_stats=stats.build() - ) - - -def _bundle_blocks_up_to_size( - blocks: List[Tuple[ObjectRef[Block], BlockMetadata]], - target_size: int, -) -> List[Tuple[Tuple[ObjectRef[Block]], Tuple[BlockMetadata]]]: - """Group blocks into bundles that are up to (but not exceeding) the provided target - size. - """ - block_bundles: List[List[Tuple[ObjectRef[Block], BlockMetadata]]] = [] - curr_bundle: List[Tuple[ObjectRef[Block], BlockMetadata]] = [] - curr_bundle_size = 0 - for b, m in blocks: - num_rows = m.num_rows - if num_rows is None: - num_rows = float("inf") - if curr_bundle_size > 0 and curr_bundle_size + num_rows > target_size: - block_bundles.append(curr_bundle) - curr_bundle = [] - curr_bundle_size = 0 - curr_bundle.append((b, m)) - curr_bundle_size += num_rows - if curr_bundle: - block_bundles.append(curr_bundle) - if len(blocks) / len(block_bundles) >= 10: - logger.warning( - f"`batch_size` is set to {target_size}, which reduces parallelism from " - f"{len(blocks)} to {len(block_bundles)}. 
If the performance is worse than " - "expected, this may indicate that the batch size is too large or the " - "input block size is too small. To reduce batch size, consider decreasing " - "`batch_size` or use the default in `map_batches`. To increase input " - "block size, consider decreasing `parallelism` in read." - ) - return [tuple(zip(*block_bundle)) for block_bundle in block_bundles] - - -def _check_batch_size( - blocks_and_meta: List[Tuple[ObjectRef[Block], BlockMetadata]], - batch_size: int, - name: str, -): - """Log a warning if the provided batch size exceeds the configured target max block - size. - """ - batch_size_bytes = None - for _, meta in blocks_and_meta: - if meta.num_rows and meta.size_bytes: - batch_size_bytes = math.ceil(batch_size * (meta.size_bytes / meta.num_rows)) - break - context = DataContext.get_current() - if ( - batch_size_bytes is not None - and batch_size_bytes > context.target_max_block_size - ): - logger.warning( - f"Requested batch size {batch_size} results in batches of " - f"{batch_size_bytes} bytes for {name} tasks, which is larger than the " - f"configured target max block size {context.target_max_block_size}. This " - "may result in out-of-memory errors for certain workloads, and you may " - "want to decrease your batch size or increase the configured target max " - "block size, e.g.: " - "from ray.data.context import DataContext; " - "DataContext.get_current().target_max_block_size = 4_000_000_000" - ) From b54ee6863812c65c9a5a886d8054e7534f4c1e5e Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 28 May 2024 12:17:48 -0700 Subject: [PATCH 32/65] [Data] Add default parameter values for `BlockAccessor.get_metadata` (#45566) BlockAccessor.get_metadata has two required arguments: input_files and exec_stats. However, we usually pass None to these arguments. To simplify the code, this PR makes the arguments optional and uses None as the default value. 
Signed-off-by: Balaji Veeramani --- python/ray/data/_internal/arrow_block.py | 4 ++-- .../_internal/execution/operators/map_operator.py | 2 +- .../_internal/execution/operators/zip_operator.py | 2 +- python/ray/data/_internal/execution/util.py | 2 +- python/ray/data/_internal/pandas_block.py | 8 ++------ .../planner/exchange/aggregate_task_spec.py | 4 +--- .../_internal/planner/exchange/sort_task_spec.py | 4 +--- .../exchange/split_repartition_task_scheduler.py | 2 +- python/ray/data/_internal/util.py | 12 +++--------- python/ray/data/block.py | 4 +++- python/ray/data/read_api.py | 9 ++------- python/ray/data/tests/test_split.py | 6 ++---- 12 files changed, 20 insertions(+), 39 deletions(-) diff --git a/python/ray/data/_internal/arrow_block.py b/python/ray/data/_internal/arrow_block.py index b05de9ed9344..dba739f75644 100644 --- a/python/ray/data/_internal/arrow_block.py +++ b/python/ray/data/_internal/arrow_block.py @@ -559,7 +559,7 @@ def merge_sorted_blocks( blocks = TableBlockAccessor.normalize_block_types(blocks, "arrow") concat_and_sort = get_concat_and_sort_transform(DataContext.get_current()) ret = concat_and_sort(blocks, sort_key) - return ret, ArrowBlockAccessor(ret).get_metadata(None, exec_stats=stats.build()) + return ret, ArrowBlockAccessor(ret).get_metadata(exec_stats=stats.build()) @staticmethod def aggregate_combined_blocks( @@ -672,7 +672,7 @@ def gen(): break ret = builder.build() - return ret, ArrowBlockAccessor(ret).get_metadata(None, exec_stats=stats.build()) + return ret, ArrowBlockAccessor(ret).get_metadata(exec_stats=stats.build()) def block_type(self) -> BlockType: return BlockType.ARROW diff --git a/python/ray/data/_internal/execution/operators/map_operator.py b/python/ray/data/_internal/execution/operators/map_operator.py index 931dbd4aeb7c..0c8023ef5397 100644 --- a/python/ray/data/_internal/execution/operators/map_operator.py +++ b/python/ray/data/_internal/execution/operators/map_operator.py @@ -425,7 +425,7 @@ def _map_task( 
map_transformer.set_target_max_block_size(ctx.target_max_block_size) for b_out in map_transformer.apply_transform(iter(blocks), ctx): # TODO(Clark): Add input file propagation from input blocks. - m_out = BlockAccessor.for_block(b_out).get_metadata([], None) + m_out = BlockAccessor.for_block(b_out).get_metadata() m_out.exec_stats = stats.build() m_out.exec_stats.udf_time_s = map_transformer.udf_time() m_out.exec_stats.task_idx = ctx.task_idx diff --git a/python/ray/data/_internal/execution/operators/zip_operator.py b/python/ray/data/_internal/execution/operators/zip_operator.py index a0092d93b62d..d93e29d6ac84 100644 --- a/python/ray/data/_internal/execution/operators/zip_operator.py +++ b/python/ray/data/_internal/execution/operators/zip_operator.py @@ -240,7 +240,7 @@ def _zip_one_block( # Zip block and other blocks. result = BlockAccessor.for_block(block).zip(other_block) br = BlockAccessor.for_block(result) - return result, br.get_metadata(input_files=[], exec_stats=stats.build()) + return result, br.get_metadata(exec_stats=stats.build()) def _get_num_rows_and_bytes(block: Block) -> Tuple[int, int]: diff --git a/python/ray/data/_internal/execution/util.py b/python/ray/data/_internal/execution/util.py index 2e10895d5168..d3bf3d9f1f54 100644 --- a/python/ray/data/_internal/execution/util.py +++ b/python/ray/data/_internal/execution/util.py @@ -25,7 +25,7 @@ def make_ref_bundles(simple_data: List[List[Any]]) -> List["RefBundle"]: [ ( ray.put(block), - BlockAccessor.for_block(block).get_metadata([], None), + BlockAccessor.for_block(block).get_metadata(), ) ], owns_blocks=True, diff --git a/python/ray/data/_internal/pandas_block.py b/python/ray/data/_internal/pandas_block.py index 599ebff222c9..3d40c97221a1 100644 --- a/python/ray/data/_internal/pandas_block.py +++ b/python/ray/data/_internal/pandas_block.py @@ -514,9 +514,7 @@ def merge_sorted_blocks( ret = pd.concat(blocks, ignore_index=True) columns, ascending = sort_key.to_pandas_sort_args() ret = 
ret.sort_values(by=columns, ascending=ascending) - return ret, PandasBlockAccessor(ret).get_metadata( - None, exec_stats=stats.build() - ) + return ret, PandasBlockAccessor(ret).get_metadata(exec_stats=stats.build()) @staticmethod def aggregate_combined_blocks( @@ -628,9 +626,7 @@ def gen(): break ret = builder.build() - return ret, PandasBlockAccessor(ret).get_metadata( - None, exec_stats=stats.build() - ) + return ret, PandasBlockAccessor(ret).get_metadata(exec_stats=stats.build()) def block_type(self) -> BlockType: return BlockType.PANDAS diff --git a/python/ray/data/_internal/planner/exchange/aggregate_task_spec.py b/python/ray/data/_internal/planner/exchange/aggregate_task_spec.py index be621a958914..45f813ada13a 100644 --- a/python/ray/data/_internal/planner/exchange/aggregate_task_spec.py +++ b/python/ray/data/_internal/planner/exchange/aggregate_task_spec.py @@ -55,9 +55,7 @@ def map( SortKey(key), ) parts = [BlockAccessor.for_block(p).combine(key, aggs) for p in partitions] - meta = BlockAccessor.for_block(block).get_metadata( - input_files=None, exec_stats=stats.build() - ) + meta = BlockAccessor.for_block(block).get_metadata(exec_stats=stats.build()) return parts + [meta] @staticmethod diff --git a/python/ray/data/_internal/planner/exchange/sort_task_spec.py b/python/ray/data/_internal/planner/exchange/sort_task_spec.py index e6cbc8aaf72f..d4b8f0b04391 100644 --- a/python/ray/data/_internal/planner/exchange/sort_task_spec.py +++ b/python/ray/data/_internal/planner/exchange/sort_task_spec.py @@ -131,9 +131,7 @@ def map( ) -> List[Union[BlockMetadata, Block]]: stats = BlockExecStats.builder() out = BlockAccessor.for_block(block).sort_and_partition(boundaries, sort_key) - meta = BlockAccessor.for_block(block).get_metadata( - input_files=None, exec_stats=stats.build() - ) + meta = BlockAccessor.for_block(block).get_metadata(exec_stats=stats.build()) return out + [meta] @staticmethod diff --git 
a/python/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py b/python/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py index 946634d3d0d2..cc21f699667b 100644 --- a/python/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py +++ b/python/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py @@ -117,7 +117,7 @@ def execute( builder = PandasBlockBuilder() empty_block = builder.build() empty_meta = BlockAccessor.for_block(empty_block).get_metadata( - input_files=None, exec_stats=None + exec_stats=None ) # No stats for empty block. empty_block_refs, empty_metadata = zip( *[(ray.put(empty_block), empty_meta) for _ in range(num_empty_blocks)] diff --git a/python/ray/data/_internal/util.py b/python/ray/data/_internal/util.py index c91f34f1214b..8aa0c95fa50c 100644 --- a/python/ray/data/_internal/util.py +++ b/python/ray/data/_internal/util.py @@ -653,9 +653,7 @@ def pandas_df_to_arrow_block(df: "pandas.DataFrame") -> "Block": stats = BlockExecStats.builder() return ( block, - BlockAccessor.for_block(block).get_metadata( - input_files=None, exec_stats=stats.build() - ), + BlockAccessor.for_block(block).get_metadata(exec_stats=stats.build()), ) @@ -666,9 +664,7 @@ def ndarray_to_block(ndarray: np.ndarray, ctx: DataContext) -> "Block": stats = BlockExecStats.builder() block = BlockAccessor.batch_to_block({"data": ndarray}) - metadata = BlockAccessor.for_block(block).get_metadata( - input_files=None, exec_stats=stats.build() - ) + metadata = BlockAccessor.for_block(block).get_metadata(exec_stats=stats.build()) return block, metadata @@ -678,9 +674,7 @@ def get_table_block_metadata( from ray.data.block import BlockAccessor, BlockExecStats stats = BlockExecStats.builder() - return BlockAccessor.for_block(table).get_metadata( - input_files=None, exec_stats=stats.build() - ) + return BlockAccessor.for_block(table).get_metadata(exec_stats=stats.build()) def unify_block_metadata_schema( diff --git 
a/python/ray/data/block.py b/python/ray/data/block.py index 1ad73760b6c7..1f8c156c8578 100644 --- a/python/ray/data/block.py +++ b/python/ray/data/block.py @@ -331,7 +331,9 @@ def schema(self) -> Union[type, "pyarrow.lib.Schema"]: raise NotImplementedError def get_metadata( - self, input_files: List[str], exec_stats: Optional[BlockExecStats] + self, + input_files: Optional[List[str]] = None, + exec_stats: Optional[BlockExecStats] = None, ) -> BlockMetadata: """Create a metadata object from this block.""" return BlockMetadata( diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index d30045cb3085..d3343ccd16f7 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -122,10 +122,7 @@ def from_blocks(blocks: List[Block]): A :class:`~ray.data.Dataset` holding the blocks. """ block_refs = [ray.put(block) for block in blocks] - metadata = [ - BlockAccessor.for_block(block).get_metadata(input_files=None, exec_stats=None) - for block in blocks - ] + metadata = [BlockAccessor.for_block(block).get_metadata() for block in blocks] from_blocks_op = FromBlocks(block_refs, metadata) logical_plan = LogicalPlan(from_blocks_op) return MaterializedDataset( @@ -207,9 +204,7 @@ def from_items( block = builder.build() blocks.append(ray.put(block)) metadata.append( - BlockAccessor.for_block(block).get_metadata( - input_files=None, exec_stats=stats.build() - ) + BlockAccessor.for_block(block).get_metadata(exec_stats=stats.build()) ) from_items_op = FromItems(blocks, metadata) diff --git a/python/ray/data/tests/test_split.py b/python/ray/data/tests/test_split.py index db729ba93f24..c9cf6f7f537b 100644 --- a/python/ray/data/tests/test_split.py +++ b/python/ray/data/tests/test_split.py @@ -93,7 +93,7 @@ def _test_equal_split_balanced(block_sizes, num_splits): for block_size in block_sizes: block = pd.DataFrame({"id": list(range(total_rows, total_rows + block_size))}) blocks.append(ray.put(block)) - 
metadata.append(BlockAccessor.for_block(block).get_metadata(None, None)) + metadata.append(BlockAccessor.for_block(block).get_metadata()) blk = (blocks[-1], metadata[-1]) ref_bundles.append(RefBundle((blk,), owns_blocks=True)) total_rows += block_size @@ -501,9 +501,7 @@ def _create_meta(num_rows): def _create_block_and_metadata(data: Any) -> Tuple[ObjectRef[Block], BlockMetadata]: block = pd.DataFrame({"id": data}) - metadata = BlockAccessor.for_block(block).get_metadata( - input_files=[], exec_stats=None - ) + metadata = BlockAccessor.for_block(block).get_metadata() return (ray.put(block), metadata) From bc76b19c312780c5540483063a8f08628f1500a9 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Tue, 28 May 2024 12:47:22 -0700 Subject: [PATCH 33/65] [ci][doc/03] add a command to query and validate apis for ray packages (#45531) As title, a command to: - Query apis for a given ray package - Print out the information for these APIs Test: - CI --------- Signed-off-by: can --- ci/lint/check_api_discrepancy.py | 73 ---------------------- ci/lint/lint.sh | 3 +- ci/ray_ci/doc/BUILD.bazel | 6 ++ ci/ray_ci/doc/cmd_check_api_discrepancy.py | 23 +++++++ 4 files changed, 31 insertions(+), 74 deletions(-) delete mode 100755 ci/lint/check_api_discrepancy.py create mode 100644 ci/ray_ci/doc/cmd_check_api_discrepancy.py diff --git a/ci/lint/check_api_discrepancy.py b/ci/lint/check_api_discrepancy.py deleted file mode 100755 index 2ecc0db68ae9..000000000000 --- a/ci/lint/check_api_discrepancy.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python - -import click -import inspect -from typing import Dict, Set - -import ray -from ray.util.annotations import _is_annotated, _get_annotation_type - - -VISITED = set() -ANNOTATED_CLASSES = {} -ANNOTATED_FUNCTIONS = {} - - -@click.command() -def main() -> None: - """ - This script checks for annotated classes and functions in a module, and finds - discrepancies between the annotations and 
the documentation. - """ - walk(ray.data) # TODO(can): parameterize the module to walk - _print(ANNOTATED_CLASSES, "classes") - _print(ANNOTATED_FUNCTIONS, "functions") - - -def _print(annotated: Dict[str, Set[str]], prefix: str) -> None: - for type, objs in annotated.items(): - print(f"{prefix} {type}".upper()) - for obj in sorted(objs): - print(obj) - print("\n") - - -def walk(module) -> None: - """ - Depth-first search through the module and its children to find annotated classes - and functions. - """ - if module in VISITED: - return - VISITED.add(module) - if not _fullname(module).startswith("ray.data"): - return - - for child in dir(module): - attribute = getattr(module, child) - - if inspect.ismodule(attribute): - walk(attribute) - if inspect.isclass(attribute): - _add_if_annotated(attribute, ANNOTATED_CLASSES) - walk(attribute) - if inspect.isfunction(attribute) and _is_annotated(attribute): - _add_if_annotated(attribute, ANNOTATED_FUNCTIONS) - return - - -def _add_if_annotated(attribute, result: Dict[str, Set[str]]): - if not _is_annotated(attribute): - return - type = _get_annotation_type(attribute) - if type not in result: - result[type] = set() - result[type].add(_fullname(attribute)) - - -def _fullname(attr): - return inspect.getmodule(attr).__name__ + "." 
+ attr.__name__ - - -if __name__ == "__main__": - main() diff --git a/ci/lint/lint.sh b/ci/lint/lint.sh index 0a3403e899fd..c17766d105e9 100755 --- a/ci/lint/lint.sh +++ b/ci/lint/lint.sh @@ -64,7 +64,8 @@ api_annotations() { api_discrepancy() { # shellcheck disable=SC2102 RAY_DISABLE_EXTRA_CPP=1 pip install -e python/[all] - ./ci/lint/check_api_discrepancy.py + # TODO(can): run this check with other ray packages + bazel run //ci/ray_ci/doc:cmd_check_api_discrepancy -- ray.data } documentation_style() { diff --git a/ci/ray_ci/doc/BUILD.bazel b/ci/ray_ci/doc/BUILD.bazel index 85feb3222ad9..8c34c44acf15 100644 --- a/ci/ray_ci/doc/BUILD.bazel +++ b/ci/ray_ci/doc/BUILD.bazel @@ -1,6 +1,12 @@ load("@rules_python//python:defs.bzl", "py_library", "py_test") load("@py_deps_buildkite//:requirements.bzl", ci_require = "requirement") +py_binary( + name = "cmd_check_api_discrepancy", + srcs = ["cmd_check_api_discrepancy.py"], + deps = [":doc"], +) + py_library( name = "doc", srcs = glob( diff --git a/ci/ray_ci/doc/cmd_check_api_discrepancy.py b/ci/ray_ci/doc/cmd_check_api_discrepancy.py new file mode 100644 index 000000000000..e55f2364ea61 --- /dev/null +++ b/ci/ray_ci/doc/cmd_check_api_discrepancy.py @@ -0,0 +1,23 @@ +import click + +from ci.ray_ci.doc.module import Module + + +@click.command() +@click.argument("module", required=True, type=str) +def main(module: str) -> None: + """ + This script checks for annotated classes and functions in a module, and finds + discrepancies between the annotations and the documentation. + """ + module = Module(module) + for api in module.get_apis(): + print( + f"API: {api.name}, " + f"Annotation Type: {api.annotation_type}, " + f"Code Type: {api.code_type}" + ) + + +if __name__ == "__main__": + main() From ce5a90f2fd0213ec2dbe0122a35a5660cb42e24e Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Tue, 28 May 2024 23:39:35 +0200 Subject: [PATCH 34/65] [RLlib] Fix frame stacking connector for experiments with more than 1 Learner. 
(#45588) --- rllib/algorithms/algorithm.py | 14 ++++ rllib/algorithms/dqn/dqn.py | 19 ++++-- rllib/algorithms/sac/sac.py | 18 +++-- rllib/env/multi_agent_env_runner.py | 2 +- rllib/env/multi_agent_episode.py | 9 ++- rllib/env/single_agent_episode.py | 2 + rllib/examples/connectors/frame_stacking.py | 31 +++++---- .../envs/classes/debug_counter_env.py | 23 +++++-- rllib/tuned_examples/dqn/cartpole_dqn.py | 7 +- .../dqn/multi_agent_cartpole_dqn.py | 66 +++++++++++++++++++ .../sac/multi_agent_pendulum_sac.py | 44 ++++++------- rllib/tuned_examples/sac/pendulum_sac.py | 42 +++++------- .../multi_agent_episode_replay_buffer.py | 55 +++++++--------- 13 files changed, 211 insertions(+), 121 deletions(-) create mode 100644 rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 6c4e771f0ac4..6b71fd59dd0c 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -3413,6 +3413,20 @@ def _compile_iteration_results_new_api_stack( ): # Return dict (shallow copy of `train_results`). results: ResultDict = train_results.copy() + + # TODO (sven): Fix Tune, instead, to be tolerant against possibly missing result + # keys. Otherwise, we'll have to guess here, what "popular" keys users use in + # order to protect them from running into Tune KeyErrors. + if ENV_RUNNER_RESULTS not in results: + results[ENV_RUNNER_RESULTS] = {} + for must_have in [ + EPISODE_RETURN_MEAN, + EPISODE_RETURN_MIN, + EPISODE_RETURN_MAX, + ]: + if must_have not in results[ENV_RUNNER_RESULTS]: + results[ENV_RUNNER_RESULTS][must_have] = np.nan + # Evaluation results. 
if eval_results: results.update(eval_results) diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index e7d61a666e1b..2674f99dbbd5 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -11,7 +11,7 @@ from collections import defaultdict import logging -from typing import Any, Callable, Dict, List, Optional, Type, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import numpy as np from ray.rllib.algorithms.algorithm import Algorithm @@ -127,7 +127,7 @@ def __init__(self, algo_class=None): # Overrides of AlgorithmConfig defaults # `env_runners()` # Set to `self.n_step`, if 'auto'. - self.rollout_fragment_length = "auto" + self.rollout_fragment_length: Union[int, str] = "auto" self.exploration_config = { "type": "EpsilonGreedy", "initial_epsilon": 1.0, @@ -235,7 +235,7 @@ def training( dueling: Optional[bool] = NotProvided, hiddens: Optional[int] = NotProvided, double_q: Optional[bool] = NotProvided, - n_step: Optional[int] = NotProvided, + n_step: Optional[Union[int, Tuple[int, int]]] = NotProvided, before_learn_on_batch: Callable[ [Type[MultiAgentBatch], List[Type[Policy]], Type[int]], Type[MultiAgentBatch], @@ -311,7 +311,12 @@ def training( hiddens: Dense-layer setup for each the advantage branch and the value branch double_q: Whether to use double DQN. - n_step: N-step for Q-learning. + n_step: N-step target updates. If >1, sars' tuples in trajectories will be + postprocessed to become sa[discounted sum of R][s t+n] tuples. An + integer will be interpreted as a fixed n-step value. If a tuple of 2 + ints is provided here, the n-step value will be drawn for each sample(!) + in the train batch from a uniform distribution over the closed interval + defined by `[n_step[0], n_step[1]]`. before_learn_on_batch: Callback to run before learning on a multi-agent batch of experiences. 
training_intensity: The intensity with which to update the model (vs @@ -477,7 +482,11 @@ def validate(self) -> None: @override(AlgorithmConfig) def get_rollout_fragment_length(self, worker_index: int = 0) -> int: if self.rollout_fragment_length == "auto": - return self.n_step + return ( + self.n_step[1] + if isinstance(self.n_step, (tuple, list)) + else self.n_step + ) else: return self.rollout_fragment_length diff --git a/rllib/algorithms/sac/sac.py b/rllib/algorithms/sac/sac.py index 8ab0c89cf3ad..fed9ae2c02ef 100644 --- a/rllib/algorithms/sac/sac.py +++ b/rllib/algorithms/sac/sac.py @@ -1,5 +1,5 @@ import logging -from typing import Any, Dict, Optional, Type, Union +from typing import Any, Dict, Optional, Tuple, Type, Union from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided from ray.rllib.algorithms.dqn.dqn import DQN @@ -86,6 +86,7 @@ def __init__(self, algo_class=None): self.target_network_update_freq = 0 # .env_runners() + # Set to `self.n_step`, if 'auto'. self.rollout_fragment_length = "auto" self.compress_observations = False self.exploration_config = { @@ -127,7 +128,7 @@ def training( tau: Optional[float] = NotProvided, initial_alpha: Optional[float] = NotProvided, target_entropy: Optional[Union[str, float]] = NotProvided, - n_step: Optional[int] = NotProvided, + n_step: Optional[Union[int, Tuple[int, int]]] = NotProvided, store_buffer_in_checkpoints: Optional[bool] = NotProvided, replay_buffer_config: Optional[Dict[str, Any]] = NotProvided, training_intensity: Optional[float] = NotProvided, @@ -169,9 +170,10 @@ def training( automatically. n_step: N-step target updates. If >1, sars' tuples in trajectories will be postprocessed to become sa[discounted sum of R][s t+n] tuples. An - integer will be interpreted as a fixed n-step value. In case of a tuple - the n-step value will be drawn for each sample in the train batch from - a uniform distribution over the interval defined by the 'n-step'-tuple. 
+ integer will be interpreted as a fixed n-step value. If a tuple of 2 + ints is provided here, the n-step value will be drawn for each sample(!) + in the train batch from a uniform distribution over the closed interval + defined by `[n_step[0], n_step[1]]`. store_buffer_in_checkpoints: Set this to True, if you want the contents of your buffer(s) to be stored in any saved checkpoints as well. Warnings will be created if: @@ -362,7 +364,11 @@ def validate(self) -> None: @override(AlgorithmConfig) def get_rollout_fragment_length(self, worker_index: int = 0) -> int: if self.rollout_fragment_length == "auto": - return self.n_step[1] if isinstance(self.n_step, tuple) else self.n_step + return ( + self.n_step[1] + if isinstance(self.n_step, (tuple, list)) + else self.n_step + ) else: return self.rollout_fragment_length diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index 857b56081fd1..077adbdad589 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -139,7 +139,7 @@ def sample( assert not (num_timesteps is not None and num_episodes is not None) # If no execution details are provided, use the config to try to infer the - # desired timesteps/episodes to sample and exploration behavior. + # desired timesteps/episodes to sample and the exploration behavior. if explore is None: explore = self.config.explore if num_timesteps is None and num_episodes is None: diff --git a/rllib/env/multi_agent_episode.py b/rllib/env/multi_agent_episode.py index a59cec2d7a63..5ce2f44ca2b3 100644 --- a/rllib/env/multi_agent_episode.py +++ b/rllib/env/multi_agent_episode.py @@ -873,7 +873,12 @@ def concat_episode(self, other: "MultiAgentEpisode") -> None: ) # Concatenate the env- to agent-timestep mappings. 
- self.env_t_to_agent_t[agent_id].extend(other.env_t_to_agent_t[agent_id]) + j = self.env_t + for i, val in enumerate(other.env_t_to_agent_t[agent_id][1:]): + if val == self.SKIP_ENV_TS_TAG: + self.env_t_to_agent_t[agent_id].append(self.SKIP_ENV_TS_TAG) + else: + self.env_t_to_agent_t[agent_id].append(i + 1 + j) # Otherwise, the agent is only in `self` and not done. All data is stored # already -> skip @@ -1883,7 +1888,7 @@ def get_agents_to_act(self) -> Set[AgentID]: `env.step()` call. Returns: - A set of AgentIDs that are suposed to send actions to the next `env.step()` + A set of AgentIDs that are supposed to send actions to the next `env.step()` call. """ return { diff --git a/rllib/env/single_agent_episode.py b/rllib/env/single_agent_episode.py index 1fdb8c0ca707..19eeeb046e9a 100644 --- a/rllib/env/single_agent_episode.py +++ b/rllib/env/single_agent_episode.py @@ -1508,6 +1508,7 @@ def slice(self, slice_: slice) -> "SingleAgentEpisode": lookback=self.observations.lookback, space=self.observation_space, ), + observation_space=self.observation_space, infos=InfiniteLookbackBuffer( data=self.get_infos( slice(start - self.infos.lookback, stop + 1, step), @@ -1523,6 +1524,7 @@ def slice(self, slice_: slice) -> "SingleAgentEpisode": lookback=self.actions.lookback, space=self.action_space, ), + action_space=self.action_space, rewards=InfiniteLookbackBuffer( data=self.get_rewards( slice(start - self.rewards.lookback, stop, step), diff --git a/rllib/examples/connectors/frame_stacking.py b/rllib/examples/connectors/frame_stacking.py index 9c316e591d3a..6abce5582b0b 100644 --- a/rllib/examples/connectors/frame_stacking.py +++ b/rllib/examples/connectors/frame_stacking.py @@ -1,15 +1,3 @@ -import gymnasium as gym - -from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule -from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner -from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack -from 
ray.rllib.examples.envs.classes.multi_agent import make_multi_agent -from ray.rllib.utils.test_utils import ( - add_rllib_example_script_args, - run_rllib_example_script_experiment, -) -from ray.tune.registry import get_trainable_cls - """ Example using connectors (V2) for frame-stacking in Atari environments. How to run this script @@ -23,6 +11,17 @@ `--wandb-key=[your WandB API key] --wandb-project=[some project name] --wandb-run-name=[optional: WandB run name (within the defined project)]` """ +import gymnasium as gym + +from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule +from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner +from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack +from ray.rllib.examples.envs.classes.multi_agent import make_multi_agent +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls # Read in common example script command line arguments. 
parser = add_rllib_example_script_args( @@ -138,7 +137,7 @@ def _env_creator(cfg): ) ) if args.enable_new_api_stack: - base_config = base_config.rl_module( + base_config.rl_module( model_config_dict=dict( { "vf_share_layers": True, @@ -147,11 +146,11 @@ def _env_creator(cfg): "post_fcnet_hiddens": [256], "uses_new_env_runners": True, }, - ), + ) ) else: - base_config = base_config.training( - { + base_config.training( + model={ "vf_share_layers": True, "conv_filters": [[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], "conv_activation": "relu", diff --git a/rllib/examples/envs/classes/debug_counter_env.py b/rllib/examples/envs/classes/debug_counter_env.py index 24d95e56b3bf..69dc0870f62a 100644 --- a/rllib/examples/envs/classes/debug_counter_env.py +++ b/rllib/examples/envs/classes/debug_counter_env.py @@ -38,15 +38,30 @@ def __init__(self, config): super().__init__() self.num_agents = config["num_agents"] self.base_episode_len = config.get("base_episode_len", 103) - # Actions are always: - # (episodeID, envID) as floats. - self.action_space = gym.spaces.Box(-float("inf"), float("inf"), shape=(2,)) + # Observation dims: # 0=agent ID. # 1=episode ID (0.0 for obs after reset). # 2=env ID (0.0 for obs after reset). # 3=ts (of the agent). - self.observation_space = gym.spaces.Box(float("-inf"), float("inf"), (4,)) + self.observation_space = gym.spaces.Dict( + { + aid: gym.spaces.Box(float("-inf"), float("inf"), (4,)) + for aid in range(self.num_agents) + } + ) + self._obs_space_in_preferred_format = True + + # Actions are always: + # (episodeID, envID) as floats. 
+ self.action_space = gym.spaces.Dict( + { + aid: gym.spaces.Box(-float("inf"), float("inf"), shape=(2,)) + for aid in range(self.num_agents) + } + ) + self._action_space_in_preferred_format = True + self.timesteps = [0] * self.num_agents self.terminateds = set() self.truncateds = set() diff --git a/rllib/tuned_examples/dqn/cartpole_dqn.py b/rllib/tuned_examples/dqn/cartpole_dqn.py index 4d1f6e07381f..6f64fba05a6f 100644 --- a/rllib/tuned_examples/dqn/cartpole_dqn.py +++ b/rllib/tuned_examples/dqn/cartpole_dqn.py @@ -15,11 +15,6 @@ config = ( DQNConfig() .environment(env="CartPole-v1") - .framework(framework="torch") - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .rl_module( # Settings identical to old stack. model_config_dict={ @@ -63,7 +58,7 @@ ) stop = { - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 450.0, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 500.0, NUM_ENV_STEPS_SAMPLED_LIFETIME: 100000, } diff --git a/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py b/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py new file mode 100644 index 000000000000..f66b0eb1b35d --- /dev/null +++ b/rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py @@ -0,0 +1,66 @@ +from ray.rllib.algorithms.dqn import DQNConfig +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.tune.registry import register_env + +from ray.rllib.utils.test_utils import add_rllib_example_script_args + +parser = add_rllib_example_script_args() +parser.set_defaults(num_agents=2) +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values to set up `config` below. 
+args = parser.parse_args() + +register_env( + "multi_agent_cartpole", + lambda _: MultiAgentCartPole({"num_agents": args.num_agents}), +) + +config = ( + DQNConfig() + .environment(env="multi_agent_cartpole") + .training( + # Settings identical to old stack. + train_batch_size_per_learner=32, + replay_buffer_config={ + "type": "MultiAgentEpisodeReplayBuffer", + "capacity": 50000, + }, + n_step=3, + double_q=True, + num_atoms=1, + noisy=False, + dueling=True, + ) + .rl_module( + model_config_dict={ + "fcnet_hiddens": [256], + "fcnet_activation": "tanh", + "epsilon": [(0, 1.0), (10000, 0.02)], + "fcnet_bias_initializer": "zeros_", + "post_fcnet_bias_initializer": "zeros_", + "post_fcnet_hiddens": [256], + }, + ) +) + +if args.num_agents: + config.multi_agent( + policy_mapping_fn=lambda aid, *arg, **kw: f"p{aid}", + policies={f"p{i}" for i in range(args.num_agents)}, + ) + +stop = { + NUM_ENV_STEPS_SAMPLED_LIFETIME: 500000, + # `episode_return_mean` is the sum of all agents/policies' returns. 
+ f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 250.0 * args.num_agents, +} + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py b/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py index a70ca0b9d62b..077ad94baec5 100644 --- a/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py +++ b/rllib/tuned_examples/sac/multi_agent_pendulum_sac.py @@ -1,46 +1,31 @@ from ray.rllib.algorithms.sac import SACConfig from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum +from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, NUM_ENV_STEPS_SAMPLED_LIFETIME, ) +from ray.rllib.utils.test_utils import add_rllib_example_script_args from ray.tune.registry import register_env -from ray.rllib.utils.test_utils import add_rllib_example_script_args +torch, nn = try_import_torch() parser = add_rllib_example_script_args() +parser.set_defaults(num_agents=2) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values to set up `config` below. 
args = parser.parse_args() register_env( "multi_agent_pendulum", - lambda _: MultiAgentPendulum({"num_agents": args.num_agents or 2}), + lambda _: MultiAgentPendulum({"num_agents": args.num_agents}), ) config = ( SACConfig() .environment(env="multi_agent_pendulum") - .rl_module( - model_config_dict={ - "fcnet_hiddens": [256, 256], - "fcnet_activation": "relu", - "post_fcnet_hiddens": [], - "post_fcnet_activation": None, - "post_fcnet_weights_initializer": "orthogonal_", - "post_fcnet_weights_initializer_config": {"gain": 0.01}, - } - ) - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .env_runners( - rollout_fragment_length=1, - num_env_runners=2, - num_envs_per_env_runner=1, - ) + .env_runners(num_env_runners=2) .training( initial_alpha=1.001, lr=3e-4, @@ -53,7 +38,18 @@ "type": "MultiAgentEpisodeReplayBuffer", "capacity": 100000, }, - num_steps_sampled_before_learning_starts=256, + num_steps_sampled_before_learning_starts=1024, + ) + .rl_module( + model_config_dict={ + "fcnet_hiddens": [256, 256], + "fcnet_activation": "tanh", + "fcnet_weights_initializer": nn.init.xavier_uniform_, + # "post_fcnet_hiddens": [], + # "post_fcnet_activation": None, + # "post_fcnet_weights_initializer": nn.init.orthogonal_, + # "post_fcnet_weights_initializer_config": {"gain": 0.01}, + } ) .reporting( metrics_num_episodes_for_smoothing=5, @@ -61,7 +57,7 @@ ) ) -if args.num_agents: +if args.num_agents > 0: config.multi_agent( policy_mapping_fn=lambda aid, *arg, **kw: f"p{aid}", policies={f"p{i}" for i in range(args.num_agents)}, @@ -70,7 +66,7 @@ stop = { NUM_ENV_STEPS_SAMPLED_LIFETIME: 500000, # `episode_return_mean` is the sum of all agents/policies' returns. 
- f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -400.0 * (args.num_agents or 2), + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -400.0 * args.num_agents, } if __name__ == "__main__": diff --git a/rllib/tuned_examples/sac/pendulum_sac.py b/rllib/tuned_examples/sac/pendulum_sac.py index b066c66a312d..a461f042e3c0 100644 --- a/rllib/tuned_examples/sac/pendulum_sac.py +++ b/rllib/tuned_examples/sac/pendulum_sac.py @@ -1,34 +1,17 @@ from ray.rllib.algorithms.sac.sac import SACConfig -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) from ray.rllib.utils.test_utils import add_rllib_example_script_args -parser = add_rllib_example_script_args() +parser = add_rllib_example_script_args( + default_timesteps=20000, + default_reward=-250.0, +) # Use `parser` to add your own custom command line options to this script # and (if needed) use their values toset up `config` below. args = parser.parse_args() config = ( SACConfig() - # Enable new API stack and use EnvRunner. 
- .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment(env="Pendulum-v1") - .rl_module( - model_config_dict={ - "fcnet_hiddens": [256, 256], - "fcnet_activation": "relu", - "post_fcnet_hiddens": [], - "post_fcnet_activation": None, - "post_fcnet_weights_initializer": "orthogonal_", - "post_fcnet_weights_initializer_config": {"gain": 0.01}, - } - ) .training( initial_alpha=1.001, lr=3e-4, @@ -45,19 +28,24 @@ }, num_steps_sampled_before_learning_starts=256, ) + .rl_module( + model_config_dict={ + "fcnet_hiddens": [256, 256], + "fcnet_activation": "relu", + "post_fcnet_hiddens": [], + "post_fcnet_activation": None, + "post_fcnet_weights_initializer": "orthogonal_", + "post_fcnet_weights_initializer_config": {"gain": 0.01}, + } + ) .reporting( metrics_num_episodes_for_smoothing=5, min_sample_timesteps_per_iteration=1000, ) ) -stop = { - NUM_ENV_STEPS_SAMPLED_LIFETIME: 20000, - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -250.0, -} - if __name__ == "__main__": from ray.rllib.utils.test_utils import run_rllib_example_script_experiment - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment(config, args) diff --git a/rllib/utils/replay_buffers/multi_agent_episode_replay_buffer.py b/rllib/utils/replay_buffers/multi_agent_episode_replay_buffer.py index 9b1af8e86cff..310791901f50 100644 --- a/rllib/utils/replay_buffers/multi_agent_episode_replay_buffer.py +++ b/rllib/utils/replay_buffers/multi_agent_episode_replay_buffer.py @@ -131,7 +131,7 @@ def add( """ episodes: List["MultiAgentEpisode"] = force_list(episodes) - new_episode_ids: List[str] = {eps.id_ for eps in episodes} + new_episode_ids: Set[str] = {eps.id_ for eps in episodes} total_env_timesteps = sum([eps.env_steps() for eps in episodes]) self._num_timesteps += total_env_timesteps self._num_timesteps_added += total_env_timesteps @@ -455,7 +455,7 @@ def _sample_independent( self, batch_size_B: Optional[int], 
batch_length_T: Optional[int], - n_step: Optional[Union[int, Tuple]], + n_step: Optional[Union[int, Tuple[int, int]]], gamma: float, include_infos: bool, include_extra_model_outputs: bool, @@ -465,7 +465,7 @@ def _sample_independent( actual_n_step = n_step or 1 # Sample the n-step if necessary. - random_n_step = isinstance(n_step, tuple) + random_n_step = isinstance(n_step, (tuple, list)) sampled_episodes = [] # TODO (simon): Ensure that the module has data and if not, skip it. @@ -828,49 +828,44 @@ def _update_module_counters(self, ma_episode: MultiAgentEpisode) -> None: # Only add if the agent has stepped in the episode (chunk). if agent_steps > 0: # Receive the corresponding module ID. - # TODO (sven, simon): Is there always a mapping? What if not? - # Is then module_id == agent_id? - module_id = ma_episode._agent_to_module_mapping[agent_id] + module_id = ma_episode.module_for(agent_id) self._num_module_timesteps[module_id] += agent_steps self._num_module_timesteps_added[module_id] += agent_steps # Also add to the module episode counter. self._num_module_episodes[module_id] += 1 def _add_new_module_indices( - self, ma_episode: MultiAgentEpisode, episode_idx: int, exists: bool = True + self, + ma_episode: MultiAgentEpisode, + episode_idx: int, + ma_episode_exists: bool = True, ) -> None: """Adds the module indices for new episode chunks. Args: - multi_agent_episode: The multi-agent episode to add the module indices for. + ma_episode: The multi-agent episode to add the module indices for. episode_idx: The index of the episode in the `self.episodes`. + ma_episode_exists: Whether `ma_episode` is already in this buffer (with a + predecessor chunk to which we'll concatenate `ma_episode` later). """ + existing_ma_episode = None + if ma_episode_exists: + existing_ma_episode = self.episodes[ + self.episode_id_to_index[ma_episode.id_] - self._num_episodes_evicted + ] for agent_id in ma_episode.agent_ids: + existing_sa_eps_len = 0 + # Get the corresponding module id. 
- module_id = ma_episode._agent_to_module_mapping[agent_id] + module_id = ma_episode.module_for(agent_id) # Get the module episode. module_eps = ma_episode.agent_episodes[agent_id] - # Check if the module episode is already in the buffer. - if exists: - old_ma_episode = self.episodes[ - self.episode_id_to_index[ma_episode.id_] - - self._num_episodes_evicted - ] - # Is the agent episode already in the buffer? - sa_episode_in_buffer = agent_id in old_ma_episode.agent_episodes - else: - # This agent episode is new. The agent might have just entered - # the environment. - sa_episode_in_buffer = False - if sa_episode_in_buffer: - existing_eps_len = len( - self.episodes[ - episode_idx - self._num_episodes_evicted - ].agent_episodes[agent_id] - ) - else: - existing_eps_len = 0 + + # Is the agent episode already in the buffer's existing `ma_episode`? + if ma_episode_exists and agent_id in existing_ma_episode.agent_episodes: + existing_sa_eps_len = len(existing_ma_episode.agent_episodes[agent_id]) + # Add new module indices. self._module_to_indices[module_id].extend( [ @@ -878,7 +873,7 @@ def _add_new_module_indices( # Keep the MAE index for sampling episode_idx, agent_id, - existing_eps_len + i, + existing_sa_eps_len + i, ) for i in range(len(module_eps)) ] From 8d14ed8b844ab3fa914d754d721982efcaad95e4 Mon Sep 17 00:00:00 2001 From: Alan Guo Date: Tue, 28 May 2024 18:52:30 -0700 Subject: [PATCH 35/65] fix node detail page grid being hard to select (#45608) There are no visual changes, but this makes the "size" of the box match the size of the text. Previously, the box was larger than the text so each box would disrupt trying to select text from another box. 
--- dashboard/client/src/pages/node/NodeDetail.tsx | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dashboard/client/src/pages/node/NodeDetail.tsx b/dashboard/client/src/pages/node/NodeDetail.tsx index b27931ae9eb7..b6de03a7ae19 100644 --- a/dashboard/client/src/pages/node/NodeDetail.tsx +++ b/dashboard/client/src/pages/node/NodeDetail.tsx @@ -92,7 +92,7 @@ const NodeDetailPage = () => { {nodeDetail && selectedTab === "info" && (
- +
Hostname
{" "} {nodeDetail.hostname} @@ -101,7 +101,7 @@ const NodeDetailPage = () => {
IP
{nodeDetail.ip}
- + {nodeDetail.cpus && ( @@ -118,7 +118,7 @@ const NodeDetailPage = () => { .join("/")} - +
Load per CPU (1/5/15min)
{" "} {nodeDetail?.loadAvg[1] && @@ -131,7 +131,7 @@ const NodeDetailPage = () => { {formatDateFromTimeMs(nodeDetail.bootTime * 1000)}
- +
Sent Tps
{" "} {memoryConverter(nodeDetail?.networkSpeed[0])}/s @@ -141,7 +141,7 @@ const NodeDetailPage = () => { {memoryConverter(nodeDetail?.networkSpeed[1])}/s
- +
Memory
{" "} {nodeDetail?.mem && ( @@ -161,7 +161,7 @@ const NodeDetailPage = () => {
- + {nodeDetail?.disk && Object.entries(nodeDetail?.disk).map(([path, obj]) => ( @@ -175,7 +175,7 @@ const NodeDetailPage = () => { ))} - +
Logs
{" "} { {raylet && Object.keys(raylet).length > 0 && selectedTab === "raylet" && (
- +
Command

@@ -201,7 +201,7 @@ const NodeDetailPage = () => {
- +
Pid
{raylet?.pid}
From 5d056f35a58956ca5b3669d61880e3cc530372be Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Tue, 28 May 2024 19:07:55 -0700 Subject: [PATCH 36/65] [ci] upgrade rayci version (#45607) Upgrade rayci version; this version enable build notification for microcheck build failures Test: - CI Signed-off-by: can --- .rayciversion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.rayciversion b/.rayciversion index a918a2aa18d5..faef31a4357c 100644 --- a/.rayciversion +++ b/.rayciversion @@ -1 +1 @@ -0.6.0 +0.7.0 From 040b736e5fc4f731a3ffd971048e9bd66984cfe2 Mon Sep 17 00:00:00 2001 From: harborn Date: Wed, 29 May 2024 11:20:12 +0800 Subject: [PATCH 37/65] [Train] Add example of fine-tuning Llama-2 on Intel Gaudi (#44667) Adds an example for fine-tuning Llama-2-7b/70b on multiple HPUs. --------- Signed-off-by: Wu, Gangsheng --- doc/source/train/examples.yml | 10 + .../train/examples/intel_gaudi/llama.ipynb | 592 ++++++++++++++++++ 2 files changed, 602 insertions(+) create mode 100644 doc/source/train/examples/intel_gaudi/llama.ipynb diff --git a/doc/source/train/examples.yml b/doc/source/train/examples.yml index 3130c590c911..1a828ddc1714 100644 --- a/doc/source/train/examples.yml +++ b/doc/source/train/examples.yml @@ -87,6 +87,16 @@ examples: use_cases: - natural language processing link: examples/transformers/huggingface_text_classification + - title: "Fine-tune Llama-2-7b and Llama-2-70b with Intel Gaudi" + frameworks: + - accelerate + - transformers + skill_level: intermediate + use_cases: + - natural language processing + - large language models + contributor: community + link: examples/intel_gaudi/llama - title: Fine-tune a Llama-2 text generation models with DeepSpeed and Hugging Face Accelerate frameworks: diff --git a/doc/source/train/examples/intel_gaudi/llama.ipynb b/doc/source/train/examples/intel_gaudi/llama.ipynb new file mode 100644 index 000000000000..ad763c9dce1f --- /dev/null +++ 
b/doc/source/train/examples/intel_gaudi/llama.ipynb @@ -0,0 +1,592 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fine-tuning Llama-2 Model with HPU\n", + "\n", + "In this Jupyter notebook, we will:\n", + "- fine-tuning a [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) model by using HPU with DDP method\n", + "- fine-tuning a [Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) model by using HPU with DeepSpeed method\n", + "\n", + "We will use PyTorch for model training and Ray for distributed training. We will use dataset [tatsu-lab/alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca).\n", + "\n", + "[Habana Gaudi AI Processors (HPUs)](https://habana.ai) are AI hardware accelerators designed by Habana Labs. For more information, see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/index.html) and [Gaudi Developer Docs](https://developer.habana.ai/).\n", + "\n", + "Basic features for this fine-tuning example are:\n", + "- Running on HPUs, support three execution mode: [\"lazy\", \"eager\", \"eager.compile\"](https://docs.habana.ai/en/latest/PyTorch/Reference/PyTorch_Gaudi_Theory_of_Operations.html).\n", + "- LoRA training.\n", + "- DDP or DeepSpeed based method.\n", + "- [`GaudiTrainer`](https://github.com/huggingface/optimum-habana/blob/main/optimum/habana/transformers/trainer.py) based training.\n", + "- Llama-2-7b/Llama-2-70b model.\n", + "- Ray based resource scheduling and management." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare environment\n", + "This example run on single node with 4 HPUs.\n", + "\n", + "We recommend using a prebuilt container to run these examples. To run a container, you need Docker. 
See [Install Docker Engine](https://docs.docker.com/engine/install/) for installation instructions.\n", + "\n", + "Next, follow [Run Using Containers](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html?highlight=installer#run-using-containers) to install the Habana drivers and container runtime.\n", + "\n", + "### Get docker image\n", + "``` bash\n", + "docker pull vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest\n", + "```\n", + "### Run docker image\n", + "``` bash\n", + "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest\n", + "# maybe should mapping your workspace volumns\n", + "```\n", + "### Install dependency\n", + "``` bash\n", + "# \"optimum-habana>1.11.1\" if exection mode \"eager\" or \"eager.compile\" \n", + "# \"ray>=2.20.0\"\n", + "pip install ray[train] notebook transformers datasets evaluate peft accelerate scikit-learn optimum-habana\n", + "\n", + "# install deepspeed\n", + "pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0\n", + "\n", + "# this notebook verfied with packages' version:\n", + "# transformers==4.38.2\n", + "# datasets==2.19.1\n", + "# evaluate==0.4.2\n", + "# peft==0.4.0\n", + "# accelerate==0.27.2\n", + "# scikit-learn==1.4.2\n", + "# optimum-habana==1.11.1\n", + "\n", + "# deepspeed==0.12.4+hpu.synapse.v1.15.0\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import copy\n", + "from typing import Dict\n", + "\n", + "import torch\n", + "\n", + "import datasets\n", + "import transformers\n", + "from transformers import DataCollatorForLanguageModeling\n", + "\n", + "from 
tqdm import tqdm\n", + "\n", + "import peft\n", + "\n", + "from optimum.habana import GaudiTrainer, GaudiConfig, GaudiTrainingArguments\n", + "from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Dataset Function\n", + "\n", + "Preprocessing the raw dataset's each line with specified format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def preprocess_dataset(raw_datasets):\n", + "\n", + " PROMPT_DICT = {\n", + " \"prompt_with_input\": (\n", + " \"Below is an instruction that describes a task, paired with an input that provides further context. \"\n", + " \"Write a response that appropriately completes the request.\\n\\n\"\n", + " \"### Instruction:\\n{instruction}\\n\\n### Input:\\n{input}\\n\\n### Response:\"\n", + " ),\n", + " \"prompt_without_input\": (\n", + " \"Below is an instruction that describes a task. 
\"\n", + " \"Write a response that appropriately completes the request.\\n\\n\"\n", + " \"### Instruction:\\n{instruction}\\n\\n### Response:\"\n", + " ),\n", + " }\n", + "\n", + " def create_prompts(examples):\n", + " prompts = {}\n", + " prompts[\"source\"] = []\n", + " prompts[\"target\"] = []\n", + " for example in examples:\n", + " prompt_template = (\n", + " PROMPT_DICT[\"prompt_with_input\"] if example[\"input\"] != \"\" else PROMPT_DICT[\"prompt_without_input\"]\n", + " )\n", + " source = prompt_template.format_map(example)\n", + " prompts[\"source\"].append(source)\n", + " prompts[\"target\"].append(example[\"output\"])\n", + " return prompts\n", + "\n", + " # Preprocessing the datasets.\n", + " for key in raw_datasets:\n", + " prompts = create_prompts(raw_datasets[key])\n", + " columns_to_be_removed = list(raw_datasets[key].features.keys())\n", + " raw_datasets[key] = raw_datasets[key].add_column(\"prompt_sources\", prompts[\"source\"])\n", + " raw_datasets[key] = raw_datasets[key].add_column(\"prompt_targets\", prompts[\"target\"])\n", + " raw_datasets[key] = raw_datasets[key].remove_columns(columns_to_be_removed)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset to Tokenizer Function\n", + "\n", + "Tokenize each line in dataset by model tokenizer.\n", + "\n", + "In example codes, we concatenate the dataset's line content to accelerate training speed.\n", + "\n", + "All datasets are processed as \"train\" datasets, no evaluation datasets are sampled from raw_datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def preprocess_dataset_to_tokenizer(raw_datasets, tokenizer):\n", + " max_seq_length = 512\n", + " tokenizer.pad_token_id = 0\n", + " tokenizer.eos_token_id = 1\n", + " tokenizer.bos_token_id = 2\n", + "\n", + " def tokenize(prompt, add_eos_token=True):\n", + " results = tokenizer(\n", + " prompt,\n", + " truncation=True,\n", + " max_length=max_seq_length,\n", + " padding=False,\n", + " return_tensors=None,\n", + " )\n", + " for i in range(len(results[\"input_ids\"])):\n", + " if (\n", + " results[\"input_ids\"][i][-1] != tokenizer.eos_token_id\n", + " and len(results[\"input_ids\"][i]) < max_seq_length\n", + " and add_eos_token\n", + " ):\n", + " results[\"input_ids\"][i].append(tokenizer.eos_token_id)\n", + " results[\"attention_mask\"][i].append(1)\n", + "\n", + " results[\"labels\"] = copy.deepcopy(results[\"input_ids\"])\n", + " results[\"input_id_len\"] = [len(result) for result in results[\"input_ids\"]]\n", + " return results\n", + "\n", + " def preprocess_function(examples):\n", + " keys = list(examples.data.keys())\n", + " if len(keys) != 2:\n", + " raise ValueError(\"Unsupported dataset format\")\n", + "\n", + " st = [s + t for s, t in zip(examples[keys[0]], examples[keys[1]])]\n", + "\n", + " examples_tokenized = tokenize(st)\n", + " input_ids = examples_tokenized[\"input_ids\"]\n", + " labels = examples_tokenized[\"labels\"]\n", + " return {\n", + " \"input_ids\": input_ids,\n", + " \"labels\": labels,\n", + " \"attention_mask\": examples_tokenized[\"attention_mask\"],\n", + " }\n", + "\n", + " tokenized_datasets = raw_datasets.map(\n", + " preprocess_function,\n", + " batched=True,\n", + " load_from_cache_file=True,\n", + " )\n", + "\n", + " def concatenate_data(dataset, max_seq_length):\n", + " concatenated_dataset = {}\n", + " for column in dataset.features:\n", + " concatenated_data = [item for sample in dataset[column] for 
item in sample]\n", + " reshaped_data = [\n", + " concatenated_data[i * max_seq_length : (i + 1) * max_seq_length]\n", + " for i in range(len(concatenated_data) // max_seq_length)\n", + " ]\n", + " concatenated_dataset[column] = reshaped_data\n", + " return datasets.Dataset.from_dict(concatenated_dataset)\n", + "\n", + " tokenized_datasets_ = tokenized_datasets[\"train\"].remove_columns([\"prompt_sources\", \"prompt_targets\"])\n", + " tokenized_datasets[\"train\"] = concatenate_data(tokenized_datasets_, max_seq_length)\n", + "\n", + " return tokenized_datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare training arguments\n", + "\n", + "here some arguments are hard coded, you can pass arguments from `config`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def prepare_training_args(config: Dict):\n", + " # prepare execution mode config\n", + " execution_mode = config[\"execution_mode\"]\n", + " use_lazy_mode = True if execution_mode == \"lazy\" else False\n", + " torch_compile_backend = \"hpu_backend\" if execution_mode == \"eager.compile\" else None\n", + "\n", + " deepspeed = config[\"deepspeed\"] if \"deepspeed\" in config else None\n", + "\n", + " return GaudiTrainingArguments(deepspeed=deepspeed,\n", + " output_dir=config[\"output\"],\n", + " do_train=True,\n", + " do_eval=False,\n", + " per_device_train_batch_size=config[\"batch_size_per_worker\"],\n", + " bf16=True,\n", + " learning_rate=config[\"lr\"],\n", + " save_strategy=\"no\",\n", + " torch_compile_backend=torch_compile_backend,\n", + " evaluation_strategy=\"no\",\n", + " lr_scheduler_type=\"cosine\",\n", + " num_train_epochs=config[\"epochs\"],\n", + " use_lazy_mode=use_lazy_mode,\n", + " use_habana=True,\n", + " pipelining_fwd_bwd=True,\n", + " save_only_model=True,\n", + " gradient_checkpointing=True,\n", + " warmup_ratio=0.03,\n", + " throughput_warmup_steps=3,\n", + " 
logging_steps=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare model\n", + "\n", + "1. download model from huggingface or read model from local directory.\n", + "2. convert model to lora model.\n", + "3. move model to HPU device.\n", + "\n", + "If you doesn't want to fine-tune with LoRA, just remove LoRA conversion step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def prepare_model(config: Dict, device):\n", + " # prepare from pretrained model\n", + " deepspeed = config[\"deepspeed\"] if \"deepspeed\" in config else None\n", + " if deepspeed is not None:\n", + " auto_config = transformers.AutoConfig.from_pretrained(config[\"model\"], use_cache=False, revision=\"main\", use_auth_token=None, trust_remote_code=None)\n", + " model = transformers.AutoModelForCausalLM.from_pretrained(config[\"model\"], config=auto_config, **config[\"model_config\"])\n", + " model.generation_config.attn_softmax_bf16 = True\n", + " model.generation_config.use_flash_attention = True\n", + " else:\n", + " model = transformers.AutoModelForCausalLM.from_pretrained(config[\"model\"], **config[\"model_config\"])\n", + " model.enable_input_require_grads()\n", + "\n", + " # convert to peft model for lora training\n", + " peft_config = peft.LoraConfig(**config[\"lora_config\"])\n", + " model = peft.get_peft_model(model, peft_config)\n", + "\n", + " model.to(dtype=config[\"model_config\"][\"torch_dtype\"], device=device)\n", + "\n", + " return model\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training Function\n", + "\n", + "This function will be executed by each worker during training, with following steps:\n", + "\n", + "- preparing training args, an instance of `GaudiTrainingArguments`.\n", + "- loading datasets and preprocess datasets, just load the first 4096 item as training datasets.\n", + "- loading pretrained model as tokenizer, and 
process datasets to tokenizer.\n", + "- loading pretrained model.\n", + "- preparing data collator and gaidu_config.\n", + "- preparing instance of `GaudiTrainer`.\n", + "- calling `train()` to train model.\n", + "- saving model results.\n", + "\n", + "Compared to a training function for GPU, no changes are needed to port to HPU. Internally, Ray Train does these things:\n", + "\n", + "- Detect HPU and set the device.\n", + "- Initialize the habana PyTorch backend.\n", + "- Initialize the habana distributed backend." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def train_func_per_worker(config: Dict):\n", + " # adapt transformers to gaudi\n", + " adapt_transformers_to_gaudi()\n", + "\n", + " # prepare training arguments\n", + " training_args = prepare_training_args(config)\n", + "\n", + " # prepare datasets\n", + " # here we use dataset \"tatsu-lab/alpaca\" from huggingface\n", + " raw_datasets = datasets.DatasetDict({\"train\": datasets.load_dataset(\"tatsu-lab/alpaca\", split='train[0:4096]')})\n", + " preprocess_dataset(raw_datasets)\n", + "\n", + " # prepare tokenizer\n", + " tokenizer = transformers.AutoTokenizer.from_pretrained(config[\"model\"])\n", + " tokenized_datasets = preprocess_dataset_to_tokenizer(raw_datasets, tokenizer)\n", + "\n", + " # prepare model\n", + " model = prepare_model(config, training_args.device)\n", + "\n", + " # prepare data collator\n", + " data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", mlm=False)\n", + "\n", + " # prepare gaudi config\n", + " gaudi_config = GaudiConfig()\n", + " gaudi_config.use_fused_adam = True\n", + " gaudi_config.use_fused_clip_norm = True\n", + "\n", + " # instance GaudiTrainer\n", + " trainer = GaudiTrainer(\n", + " model=model,\n", + " gaudi_config=gaudi_config,\n", + " args=training_args,\n", + " train_dataset=tokenized_datasets[\"train\"],\n", + " eval_dataset=None,\n", + " 
tokenizer=tokenizer,\n", + " data_collator=data_collator,\n", + " compute_metrics=None,\n", + " preprocess_logits_for_metrics=None,\n", + " )\n", + "\n", + " train_result = trainer.train()\n", + " print(f\"train_result = {train_result}\")\n", + " trainer.save_model()\n", + "\n", + " return train_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Main Training Function\n", + "The `train_llama` function sets up the distributed training environment using Ray and starts the training process. To enable training using HPU, we only need to make the following changes:\n", + "- Set the exectuion mode for training, supported execution mode are:\n", + "\n", + " - \"lazy\": Deferred execution of graphs, comprising of ops delivered from script op by op similar to Eager mode. It gives the Eager mode experience with performance on Gaudi. Unlike Eager Mode with torch.compile, graph is analyzed in each iteration leading to a higher CPU usage.\n", + " - \"eager\": Op-by-op execution as defined in standard PyTorch Eager mode scripts.\n", + " - \"eager.compile\": Eager mode extended with `torch.compile` - Similar to Eager mode but extended with wrapping complete or part of model (such as a function) into a graph. 
Parts that are not wrapped are executed eagerly.\n", + "\n", + " More detail theory can be found [here](https://docs.habana.ai/en/latest/PyTorch/Reference/PyTorch_Gaudi_Theory_of_Operations.html), and detail performance results can be found [here](https://developer.habana.ai/get-started/habana-models-performance/)\n", + "- Set training method, supported method are:\n", + " - \"ddp\"\n", + " - \"deepspeed\"\n", + "- Require an HPU for each worker in ScalingConfig\n", + "- Set backend to `hccl` in TorchConfig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def train_llama(num_workers, execution_mode, training_method):\n", + " import ray\n", + " from ray.train import ScalingConfig\n", + " from ray.train.torch import TorchTrainer, TorchConfig\n", + "\n", + " # deepspeed config, can also place it to config file\n", + " deepspeed_config = {\n", + " \"steps_per_print\": 64,\n", + " \"train_batch_size\": \"auto\",\n", + " \"train_micro_batch_size_per_gpu\": \"auto\",\n", + " \"gradient_accumulation_steps\": \"auto\",\n", + " \"bf16\": {\n", + " \"enabled\": True\n", + " },\n", + " \"gradient_clipping\": 1.0,\n", + " \"zero_optimization\": {\n", + " \"stage\": 3,\n", + " \"overlap_comm\": False,\n", + " \"contiguous_gradients\": False,\n", + " \"stage3_gather_16bit_weights_on_model_save\": True\n", + " }\n", + " }\n", + "\n", + " # Preparing train configurations\n", + " train_config = {\n", + " \"execution_mode\": execution_mode,\n", + " \"model\": \"/root/models/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080/\",\n", + " \"model_config\": {\"torch_dtype\": torch.bfloat16, \"trust_remote_code\": False, \"use_auth_token\": None},\n", + " \"lora_config\": {\"task_type\": \"CAUSAL_LM\", \"r\": 8, \"lora_alpha\": 32, \"lora_dropout\": 0.1, \"target_modules\": [\"q_proj\", \"v_proj\"]},\n", + " \"lr\": 1e-4,\n", + " \"epochs\": 2,\n", + " 
\"batch_size_per_worker\": 8,\n", + " \"output\": \"/tmp/ray/\",\n", + " \"deepspeed\": deepspeed_config if training_method == \"deepspeed\" else None,\n", + " }\n", + "\n", + " # Configure computation resources\n", + " # In ScalingConfig, require an HPU for each worker\n", + " scaling_config = ScalingConfig(num_workers=num_workers, resources_per_worker={\"CPU\": 1, \"HPU\": 1})\n", + " # Set backend to hccl in TorchConfig\n", + " torch_config = TorchConfig(backend = \"hccl\")\n", + "\n", + " # start your ray cluster\n", + " ray.init()\n", + "\n", + " # Initialize a Ray TorchTrainer\n", + " trainer = TorchTrainer(\n", + " train_loop_per_worker=train_func_per_worker,\n", + " train_loop_config=train_config,\n", + " torch_config=torch_config,\n", + " scaling_config=scaling_config,\n", + " )\n", + "\n", + " result = trainer.fit()\n", + " print(f\"Training result: {result}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start Training\n", + "\n", + "Finally, we call the `train_llama` function to start the training process. You can adjust the number of workers to use, and the execution mode for HPU." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set some environment variables\n", + "os.environ[\"RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES\"] = \"0\"\n", + "# if using RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES env var\n", + "# you must set HABANA_VISIBLE_DEVICES, such as\n", + "# os.environ[\"HABANA_VISIBLE_DEVICES\"] = \"0,1,2,3\"\n", + "\n", + "# execution_mode are [\"lazy\", \"eager\", \"eager.compile\"]\n", + "execution_mode = \"lazy\"\n", + "os.environ[\"PT_HPU_LAZY_MODE\"] = \"1\" if execution_mode == \"lazy\" else \"0\"\n", + "\n", + "# training_method are [\"ddp\", \"deepspeed\"]\n", + "training_method = \"deepspeed\"\n", + "if training_method == \"deepspeed\":\n", + " os.environ[\"PT_HPU_MAX_COMPOUND_OP_SIZE\"] = \"10\"\n", + " os.environ[\"DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED\"] = \"1\"\n", + "\n", + "# here use 4 HPUs\n", + "train_llama(num_workers=4, execution_mode=execution_mode, training_method=training_method)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Final output\n", + "\n", + "### For DDP on HPUs\n", + "- Llama-2-70b-chat-hf\n", + "- 4 HPU\n", + "- LoRA\n", + "\n", + "``` bash\n", + "(RayTrainWorker pid=123181) {'loss': 1.8051, 'grad_norm': 0.6015625, 'learning_rate': 9.938441702975689e-05, 'epoch': 0.16, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=123181) {'loss': 1.6754, 'grad_norm': 0.408203125, 'learning_rate': 9.567727288213005e-05, 'epoch': 0.32, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=123181) {'loss': 1.568, 'grad_norm': 0.4453125, 'learning_rate': 8.885729807284856e-05, 'epoch': 0.48, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=123181) {'loss': 1.4934, 
'grad_norm': 0.4609375, 'learning_rate': 7.938926261462366e-05, 'epoch': 0.65, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=123181) {'loss': 1.3965, 'grad_norm': 0.3515625, 'learning_rate': 6.7918397477265e-05, 'epoch': 0.81, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=123181) {'loss': 1.3461, 'grad_norm': 0.34765625, 'learning_rate': 5.522642316338268e-05, 'epoch': 0.97, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=123181) {'loss': 1.2924, 'grad_norm': 0.32421875, 'learning_rate': 4.2178276747988446e-05, 'epoch': 1.13, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=123181) {'loss': 1.2643, 'grad_norm': 0.33203125, 'learning_rate': 2.9663167846209998e-05, 'epoch': 1.29, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=123181) {'loss': 1.263, 'grad_norm': 0.318359375, 'learning_rate': 1.8533980447508137e-05, 'epoch': 1.45, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=123181) {'loss': 1.2502, 'grad_norm': 0.275390625, 'learning_rate': 9.549150281252633e-06, 'epoch': 1.61, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=123181) {'loss': 1.2161, 'grad_norm': 0.2734375, 'learning_rate': 3.3209786751399187e-06, 'epoch': 1.77, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=123181) {'loss': 1.2517, 'grad_norm': 0.294921875, 'learning_rate': 2.7390523158633554e-07, 'epoch': 1.94, 
'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}\n", + "```\n", + "\n", + "### For DeepSpeed on HPUs\n", + "- Llama-2-70b-chat-hf\n", + "- 4 HPU\n", + "- LoRA\n", + "\n", + "``` bash\n", + "(RayTrainWorker pid=110856) {'loss': 1.6627, 'grad_norm': 0.35921376943588257, 'learning_rate': 9.938441702975689e-05, 'epoch': 0.16, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=110856) {'loss': 1.6085, 'grad_norm': 0.35271379351615906, 'learning_rate': 9.567727288213005e-05, 'epoch': 0.32, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=110856) {'loss': 1.5051, 'grad_norm': 0.4277978837490082, 'learning_rate': 8.885729807284856e-05, 'epoch': 0.48, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=110856) {'loss': 1.4157, 'grad_norm': 0.5138524770736694, 'learning_rate': 7.938926261462366e-05, 'epoch': 0.65, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=110856) {'loss': 1.3233, 'grad_norm': 0.3451262414455414, 'learning_rate': 6.7918397477265e-05, 'epoch': 0.81, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=110856) {'loss': 1.2728, 'grad_norm': 0.38564223051071167, 'learning_rate': 5.522642316338268e-05, 'epoch': 0.97, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=110856) {'loss': 1.1989, 'grad_norm': 0.36078131198883057, 'learning_rate': 4.2178276747988446e-05, 'epoch': 1.13, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=110856) 
{'loss': 1.1552, 'grad_norm': 0.47946077585220337, 'learning_rate': 2.9663167846209998e-05, 'epoch': 1.29, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=110856) {'loss': 1.1413, 'grad_norm': 0.3357600271701813, 'learning_rate': 1.8533980447508137e-05, 'epoch': 1.45, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=110856) {'loss': 1.129, 'grad_norm': 0.2777070701122284, 'learning_rate': 9.549150281252633e-06, 'epoch': 1.61, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=110856) {'loss': 1.0876, 'grad_norm': 0.25669950246810913, 'learning_rate': 3.3209786751399187e-06, 'epoch': 1.77, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}\n", + "(RayTrainWorker pid=110856) {'loss': 1.1238, 'grad_norm': 0.2423330545425415, 'learning_rate': 2.7390523158633554e-07, 'epoch': 1.94, 'memory_allocated (GB)': 32.88, 'max_memory_allocated (GB)': 43.56, 'total_memory_available (GB)': 94.62}\n", + "```" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "orphan": true, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 0be0639f37864e2671c4069ee11becb4110b6a85 Mon Sep 17 00:00:00 2001 From: Shubham Gandhi Date: Wed, 29 May 2024 12:38:56 +0530 Subject: [PATCH 38/65] Documentation/improve experimental ray serve advanced guide (#45046) ## Why are these changes needed? 
Update the experimental feature guide on multi-container deployment approach for Ray Serve. ## Related issue number Closes: #45026 ## Checks - [x] I've signed off every commit(by using the -s flag, i.e., `git commit -s`) in this PR. - [ ] I've run `scripts/format.sh` to lint the changes in this PR. - [ ] I've included any doc changes needed for https://docs.ray.io/en/master/. - [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file. - [ ] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [ ] Unit tests - [ ] Release tests - [ ] This PR is not tested :( --------- Signed-off-by: dudeperf3ct --- .../advanced-guides/multi-app-container.md | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/doc/source/serve/advanced-guides/multi-app-container.md b/doc/source/serve/advanced-guides/multi-app-container.md index a7658c906981..8e1bb208409a 100644 --- a/doc/source/serve/advanced-guides/multi-app-container.md +++ b/doc/source/serve/advanced-guides/multi-app-container.md @@ -44,6 +44,9 @@ RUN sudo apt-get update && sudo apt-get install curl -y # Download the source code for the Whisper application into `whisper_example.py`. RUN curl -O https://raw.githubusercontent.com/ray-project/ray/master/doc/source/serve/doc_code/whisper_example.py + +# Add /home/ray to PYTHONPATH to avoid module import errors +ENV PYTHONPATH "${PYTHONPATH}:/home/ray" ``` ::: :::{tab-item} resnet.Dockerfile @@ -57,39 +60,42 @@ RUN sudo apt-get update && sudo apt-get install curl -y # Download the source code for the ResNet application into `resnet50_example.py`. 
RUN curl -O https://raw.githubusercontent.com/ray-project/ray/master/doc/source/serve/doc_code/resnet50_example.py + +# Add /home/ray to PYTHONPATH to avoid module import errors +ENV PYTHONPATH "${PYTHONPATH}:/home/ray" ``` ::: :::: -Then, build the corresponding Docker images and push it to your choice of Docker registry. This tutorial uses `alice/whisper_image:latest` and `alice/resnet_image:latest` as placeholder names for the images, but make sure to swap out `alice` for a repo name of your choice. +Then, build the corresponding container images and push them to your choice of container registry. This tutorial uses `alice/whisper_image:latest` and `alice/resnet_image:latest` as placeholder names for the images, but make sure to swap out `alice` for a repo name of your choice. ::::{tab-set} :::{tab-item} Whisper ```bash -# Build the Docker image from the Dockerfile +# Build the container image from the Dockerfile using podman export IMG1=alice/whisper_image:latest -docker build -t $IMG1 -f whisper.Dockerfile . -# Push to a Docker registry. This step is unnecessary if you are deploying Serve locally. -docker push $IMG1 +podman build -t $IMG1 -f whisper.Dockerfile . +# Push to a registry. This step is unnecessary if you are deploying Serve locally. +podman push $IMG1 ``` ::: :::{tab-item} Resnet ```bash -# Build the Docker image from the Dockerfile +# Build the container image from the Dockerfile using podman export IMG2=alice/resnet_image:latest -docker build -t $IMG2 -f resnet.Dockerfile . -# Push to a Docker registry. This step is unnecessary if you are deploying Serve locally. -docker push $IMG2 +podman build -t $IMG2 -f resnet.Dockerfile . +# Push to a registry. This step is unnecessary if you are deploying Serve locally. +podman push $IMG2 ``` ::: :::: -Finally, you can specify the Docker image within which you want to run each application in the `container` field of an application's runtime environment specification. 
The `container` field has three fields: +Finally, you can specify the container image within which you want to run each application in the `container` field of an application's runtime environment specification. The `container` field has three fields: - `image`: (Required) The image to run your application in. - `worker_path`: The absolute path to `default_worker.py` inside the container. - `run_options`: Additional options to pass to the `podman run` command used to start a Serve deployment replica in a container. See [podman run documentation](https://docs.podman.io/en/latest/markdown/podman-run.1.html) for a list of all options. If you are familiar with `docker run`, most options work the same way. -The following Serve config runs the `whisper` app with the image `IMG1`, and the `resnet` app with the image `IMG2`. Concretely, all deployment replicas in the applications start and run in containers with the respective images. +The following Serve config runs the `whisper` app with the image `IMG1`, and the `resnet` app with the image `IMG2`. Use the `podman images` command to list the names of the images. Concretely, all deployment replicas in the applications start and run in containers with the respective images. 
```yaml applications: From 1719a8f137e8c80920eb73879fa91d93bd1ea69f Mon Sep 17 00:00:00 2001 From: chuanzhisongshu <49055103+982945902@users.noreply.github.com> Date: Wed, 29 May 2024 23:58:23 +0800 Subject: [PATCH 39/65] [Core] Remove duplicate included header (#45406) Signed-off-by: lishuo121 --- src/ray/util/logging.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ray/util/logging.cc b/src/ray/util/logging.cc index 8e9f45d4b865..fdcf5d22b56c 100644 --- a/src/ray/util/logging.cc +++ b/src/ray/util/logging.cc @@ -16,7 +16,6 @@ #include -#include #ifdef _WIN32 #include #else From a84a1b2284c15c5ec17fd55afa479f0014abe508 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Wed, 29 May 2024 09:38:47 -0700 Subject: [PATCH 40/65] [runtime_env] unify `container` and (new) `image_uri` under plugin mechanism (#45156) Signed-off-by: Cindy Zhang --- .../runtime_env/agent/runtime_env_agent.py | 13 +- python/ray/_private/runtime_env/container.py | 72 ---------- python/ray/_private/runtime_env/context.py | 16 +-- .../ray/_private/runtime_env/default_impl.py | 5 + python/ray/_private/runtime_env/image_uri.py | 124 ++++++++++++++++++ python/ray/_private/runtime_env/plugin.py | 4 +- python/ray/_private/runtime_env/py_modules.py | 2 +- python/ray/runtime_env/runtime_env.py | 15 +++ python/ray/tests/conftest_docker.py | 2 +- .../test_log_file_exists.py | 13 +- .../runtime_env_container/test_put_get.py | 13 +- .../test_ray_env_vars.py | 15 ++- .../runtime_env_container/test_serve_basic.py | 16 ++- .../test_serve_telemetry.py | 17 ++- .../test_shared_memory.py | 12 +- ...xit_intended_system_exit_and_user_error.py | 20 ++- .../ray/tests/test_runtime_env_container.py | 37 +++++- 17 files changed, 280 insertions(+), 116 deletions(-) delete mode 100644 python/ray/_private/runtime_env/container.py create mode 100644 python/ray/_private/runtime_env/default_impl.py create mode 100644 python/ray/_private/runtime_env/image_uri.py diff --git 
a/python/ray/_private/runtime_env/agent/runtime_env_agent.py b/python/ray/_private/runtime_env/agent/runtime_env_agent.py index 830e0356aef6..59959f19aad3 100644 --- a/python/ray/_private/runtime_env/agent/runtime_env_agent.py +++ b/python/ray/_private/runtime_env/agent/runtime_env_agent.py @@ -14,9 +14,10 @@ import ray._private.runtime_env.agent.runtime_env_consts as runtime_env_consts from ray._private.ray_logging import setup_component_logger from ray._private.runtime_env.conda import CondaPlugin -from ray._private.runtime_env.container import ContainerManager from ray._private.runtime_env.context import RuntimeEnvContext +from ray._private.runtime_env.default_impl import get_image_uri_plugin from ray._private.runtime_env.java_jars import JavaJarsPlugin +from ray._private.runtime_env.image_uri import ContainerPlugin from ray._private.runtime_env.pip import PipPlugin from ray._private.gcs_utils import GcsAioClient from ray._private.runtime_env.plugin import ( @@ -200,11 +201,12 @@ def __init__( self._working_dir_plugin = WorkingDirPlugin( self._runtime_env_dir, self._gcs_aio_client ) + self._container_plugin = ContainerPlugin(temp_dir) # TODO(jonathan-anyscale): change the plugin to ProfilerPlugin # and unify with nsight and other profilers. self._nsight_plugin = NsightPlugin(self._runtime_env_dir) - self._container_manager = ContainerManager(temp_dir) self._mpi_plugin = MPIPlugin() + self._image_uri_plugin = get_image_uri_plugin(temp_dir) # TODO(architkulkarni): "base plugins" and third-party plugins should all go # through the same code path. 
We should never need to refer to @@ -215,8 +217,10 @@ def __init__( self._conda_plugin, self._py_modules_plugin, self._java_jars_plugin, + self._container_plugin, self._nsight_plugin, self._mpi_plugin, + self._image_uri_plugin, ] self._plugin_manager = RuntimeEnvPluginManager() for plugin in self._base_plugins: @@ -244,7 +248,7 @@ def __init__( "Listening to address %s, port %d", address, runtime_env_agent_port ) - def uris_parser(self, runtime_env): + def uris_parser(self, runtime_env: RuntimeEnv): result = list() for name, plugin_setup_context in self._plugin_manager.plugins.items(): plugin = plugin_setup_context.class_instance @@ -309,9 +313,6 @@ async def _setup_runtime_env( # avoid lint error. That will be moved to cgroup plugin. per_job_logger.debug(f"Worker has resource :" f"{allocated_resource}") context = RuntimeEnvContext(env_vars=runtime_env.env_vars()) - await self._container_manager.setup( - runtime_env, context, logger=per_job_logger - ) # Warn about unrecognized fields in the runtime env. for name, _ in runtime_env.plugins(): diff --git a/python/ray/_private/runtime_env/container.py b/python/ray/_private/runtime_env/container.py deleted file mode 100644 index afae8455549e..000000000000 --- a/python/ray/_private/runtime_env/container.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -import logging - -from typing import Optional - -from ray._private.runtime_env.context import RuntimeEnvContext - -default_logger = logging.getLogger(__name__) - - -class ContainerManager: - def __init__(self, tmp_dir: str): - # _ray_tmp_dir will be mounted into container, so the worker process - # can connect to raylet. 
- self._ray_tmp_dir = tmp_dir - - async def setup( - self, - runtime_env: "RuntimeEnv", # noqa: F821 - context: RuntimeEnvContext, - logger: Optional[logging.Logger] = default_logger, - ): - if not runtime_env.has_py_container() or not runtime_env.py_container_image(): - return - - container_driver = "podman" - context.container = runtime_env["container"] - container_command = [ - container_driver, - "run", - "-v", - self._ray_tmp_dir + ":" + self._ray_tmp_dir, - "--cgroup-manager=cgroupfs", - "--network=host", - "--pid=host", - "--ipc=host", - # NOTE(zcin): Mounted volumes in rootless containers are - # owned by the user `root`. The user on host (which will - # usually be `ray` if this is being run in a ray docker - # image) who started the container is mapped using user - # namespaces to the user `root` in a rootless container. In - # order for the Ray Python worker to access the mounted ray - # tmp dir, we need to use keep-id mode which maps the user - # as itself (instead of as `root`) into the container. - # https://www.redhat.com/sysadmin/rootless-podman-user-namespace-modes - "--userns=keep-id", - ] - - # The RAY_RAYLET_PID and RAY_JOB_ID environment variables are - # needed for the default worker. 
- container_command.append("--env") - container_command.append("RAY_RAYLET_PID=" + os.getenv("RAY_RAYLET_PID")) - container_command.append("--env") - container_command.append("RAY_JOB_ID=$RAY_JOB_ID") - for env_var_name, env_var_value in os.environ.items(): - if env_var_name.startswith("RAY_") and env_var_name not in [ - "RAY_RAYLET_PID", - "RAY_JOB_ID", - ]: - container_command.append("--env") - container_command.append(f"{env_var_name}='{env_var_value}'") - - if runtime_env.py_container_run_options(): - container_command.extend(runtime_env.py_container_run_options()) - # TODO(chenk008): add resource limit - container_command.append("--entrypoint") - container_command.append("python") - container_command.append(runtime_env.py_container_image()) - context.py_executable = " ".join(container_command) - logger.info( - "start worker in container with prefix: {}".format(context.py_executable) - ) diff --git a/python/ray/_private/runtime_env/context.py b/python/ray/_private/runtime_env/context.py index 5eea9690f896..46a01af267be 100644 --- a/python/ray/_private/runtime_env/context.py +++ b/python/ray/_private/runtime_env/context.py @@ -4,7 +4,7 @@ import subprocess import shlex import sys -from typing import Any, Dict, List, Optional +from typing import Dict, List, Optional from ray.util.annotations import DeveloperAPI from ray.core.generated.common_pb2 import Language @@ -24,7 +24,7 @@ def __init__( env_vars: Dict[str, str] = None, py_executable: Optional[str] = None, resources_dir: Optional[str] = None, - container: Dict[str, Any] = None, + override_worker_entrypoint: Optional[str] = None, java_jars: List[str] = None, ): self.command_prefix = command_prefix or [] @@ -35,7 +35,7 @@ def __init__( # the legacy Ray client codepath to pass the resources dir to the shim # process. We should remove it once Ray client uses the agent. 
self.resources_dir: str = resources_dir - self.container = container or {} + self.override_worker_entrypoint: Optional[str] = override_worker_entrypoint self.java_jars = java_jars or [] def serialize(self) -> str: @@ -72,13 +72,13 @@ def exec_worker(self, passthrough_args: List[str], language: Language): # However, the path to default_worker.py inside the container # can be different. We need the user to specify the path to # default_worker.py inside the container. - default_worker_path = self.container.get("worker_path") - if self.container and default_worker_path: + # default_worker_path = self.container.get("worker_path") + if self.override_worker_entrypoint: logger.debug( - f"Changing the default worker path from {passthrough_args[0]} to " - f"{default_worker_path}." + f"Changing the worker entrypoint from {passthrough_args[0]} to " + f"{self.override_worker_entrypoint}." ) - passthrough_args[0] = default_worker_path + passthrough_args[0] = self.override_worker_entrypoint if sys.platform == "win32": diff --git a/python/ray/_private/runtime_env/default_impl.py b/python/ray/_private/runtime_env/default_impl.py new file mode 100644 index 000000000000..40fd5485be61 --- /dev/null +++ b/python/ray/_private/runtime_env/default_impl.py @@ -0,0 +1,5 @@ +from ray._private.runtime_env.image_uri import ImageURIPlugin + + +def get_image_uri_plugin(ray_tmp_dir: str): + return ImageURIPlugin(ray_tmp_dir) diff --git a/python/ray/_private/runtime_env/image_uri.py b/python/ray/_private/runtime_env/image_uri.py new file mode 100644 index 000000000000..ab8b2f779d8b --- /dev/null +++ b/python/ray/_private/runtime_env/image_uri.py @@ -0,0 +1,124 @@ +import logging +import os +from typing import List, Optional + +from ray._private.runtime_env.context import RuntimeEnvContext +from ray._private.runtime_env.plugin import RuntimeEnvPlugin + +default_logger = logging.getLogger(__name__) + + +def _modify_context_impl( + image_uri: str, + run_options: Optional[List[str]], + worker_path: 
Optional[str], + context: RuntimeEnvContext, + logger: logging.Logger, + ray_tmp_dir: str, +): + if worker_path: + context.override_worker_entrypoint = worker_path + + container_driver = "podman" + container_command = [ + container_driver, + "run", + "-v", + ray_tmp_dir + ":" + ray_tmp_dir, + "--cgroup-manager=cgroupfs", + "--network=host", + "--pid=host", + "--ipc=host", + # NOTE(zcin): Mounted volumes in rootless containers are + # owned by the user `root`. The user on host (which will + # usually be `ray` if this is being run in a ray docker + # image) who started the container is mapped using user + # namespaces to the user `root` in a rootless container. In + # order for the Ray Python worker to access the mounted ray + # tmp dir, we need to use keep-id mode which maps the user + # as itself (instead of as `root`) into the container. + # https://www.redhat.com/sysadmin/rootless-podman-user-namespace-modes + "--userns=keep-id", + ] + + # The RAY_RAYLET_PID and RAY_JOB_ID environment variables are + # needed for the default worker. 
+ container_command.append("--env") + container_command.append("RAY_RAYLET_PID=" + os.getenv("RAY_RAYLET_PID")) + container_command.append("--env") + container_command.append("RAY_JOB_ID=$RAY_JOB_ID") + for env_var_name, env_var_value in os.environ.items(): + if env_var_name.startswith("RAY_") and env_var_name not in [ + "RAY_RAYLET_PID", + "RAY_JOB_ID", + ]: + container_command.append("--env") + container_command.append(f"{env_var_name}='{env_var_value}'") + + if run_options: + container_command.extend(run_options) + # TODO(chenk008): add resource limit + container_command.append("--entrypoint") + container_command.append("python") + container_command.append(image_uri) + + # Example: + # podman run -v /tmp/ray:/tmp/ray + # --cgroup-manager=cgroupfs --network=host --pid=host --ipc=host + # --userns=keep-id --env RAY_RAYLET_PID=23478 --env RAY_JOB_ID=$RAY_JOB_ID + # --entrypoint python rayproject/ray:nightly-py39 + container_command_str = " ".join(container_command) + logger.info(f"Starting worker in container with prefix {container_command_str}") + + context.py_executable = container_command_str + + +class ImageURIPlugin(RuntimeEnvPlugin): + """Starts worker in a container of a custom image.""" + + name = "image_uri" + + def __init__(self, ray_tmp_dir: str): + self._ray_tmp_dir = ray_tmp_dir + + def modify_context( + self, + uris: List[str], + runtime_env: "RuntimeEnv", # noqa: F821 + context: RuntimeEnvContext, + logger: Optional[logging.Logger] = default_logger, + ): + if not runtime_env.image_uri(): + return + + _modify_context_impl( + runtime_env.image_uri(), [], None, context, logger, self._ray_tmp_dir + ) + + +class ContainerPlugin(RuntimeEnvPlugin): + """Starts worker in container.""" + + name = "container" + + def __init__(self, ray_tmp_dir: str): + self._ray_tmp_dir = ray_tmp_dir + + def modify_context( + self, + uris: List[str], + runtime_env: "RuntimeEnv", # noqa: F821 + context: RuntimeEnvContext, + logger: Optional[logging.Logger] = default_logger, + 
): + if not runtime_env.has_py_container() or not runtime_env.py_container_image(): + return + + _modify_context_impl( + runtime_env.py_container_image(), + runtime_env.py_container_run_options(), + runtime_env.py_container_worker_path(), + context, + logger, + self._ray_tmp_dir, + ) diff --git a/python/ray/_private/runtime_env/plugin.py b/python/ray/_private/runtime_env/plugin.py index 98d610b138f8..a1e03a507b59 100644 --- a/python/ray/_private/runtime_env/plugin.py +++ b/python/ray/_private/runtime_env/plugin.py @@ -47,7 +47,7 @@ def get_uris(self, runtime_env: "RuntimeEnv") -> List[str]: # noqa: F821 async def create( self, uri: Optional[str], - runtime_env: "RuntimeEnv", # noqa: F821 + runtime_env, context: RuntimeEnvContext, logger: logging.Logger, ) -> float: @@ -227,7 +227,7 @@ def sorted_plugin_setup_contexts(self) -> List[PluginSetupContext]: async def create_for_plugin_if_needed( - runtime_env, + runtime_env: "RuntimeEnv", # noqa: F821 plugin: RuntimeEnvPlugin, uri_cache: URICache, context: RuntimeEnvContext, diff --git a/python/ray/_private/runtime_env/py_modules.py b/python/ray/_private/runtime_env/py_modules.py index c78f10da5f20..551e22b3650a 100644 --- a/python/ray/_private/runtime_env/py_modules.py +++ b/python/ray/_private/runtime_env/py_modules.py @@ -178,7 +178,7 @@ def delete_uri( return local_dir_size - def get_uris(self, runtime_env: dict) -> List[str]: + def get_uris(self, runtime_env) -> List[str]: return runtime_env.py_modules() async def create( diff --git a/python/ray/runtime_env/runtime_env.py b/python/ray/runtime_env/runtime_env.py index 5cd49f5b32ee..6999e463e761 100644 --- a/python/ray/runtime_env/runtime_env.py +++ b/python/ray/runtime_env/runtime_env.py @@ -288,6 +288,7 @@ class MyClass: "worker_process_setup_hook", "_nsight", "mpi", + "image_uri", } extensions_fields: Set[str] = { @@ -310,6 +311,7 @@ def __init__( config: Optional[Union[Dict, RuntimeEnvConfig]] = None, _validate: bool = True, mpi: Optional[Dict] = None, + 
image_uri: Optional[str] = None, **kwargs, ): super().__init__() @@ -335,6 +337,8 @@ def __init__( runtime_env["worker_process_setup_hook"] = worker_process_setup_hook if mpi is not None: runtime_env["mpi"] = mpi + if image_uri is not None: + runtime_env["image_uri"] = image_uri if runtime_env.get("java_jars"): runtime_env["java_jars"] = runtime_env.get("java_jars") @@ -368,6 +372,14 @@ def __init__( f"Specified fields: {runtime_env.keys()}" ) + if self.get("image_uri"): + if len(runtime_env) > 1: + raise ValueError( + "The 'image_uri' field currently cannot be used " + "together with other fields of runtime_env. " + f"Specified fields: {runtime_env.keys()}" + ) + for option, validate_fn in OPTION_TO_VALIDATION_FN.items(): option_val = self.get(option) if option_val is not None: @@ -545,6 +557,9 @@ def py_container_run_options(self) -> List: return None return self["container"].get("run_options", []) + def image_uri(self) -> Optional[str]: + return self.get("image_uri") + def plugins(self) -> List[Tuple[str, Any]]: result = list() for key, value in self.items(): diff --git a/python/ray/tests/conftest_docker.py b/python/ray/tests/conftest_docker.py index 52eebb9d6a19..363cee6ccb5a 100644 --- a/python/ray/tests/conftest_docker.py +++ b/python/ray/tests/conftest_docker.py @@ -175,7 +175,7 @@ def run_in_container(cmds: List[List[str]], container_id: str): NESTED_IMAGE_NAME = "rayproject/ray:runtime_env_container_nested" -@pytest.fixture +@pytest.fixture(scope="session") def podman_docker_cluster(): start_container_command = [ "docker", diff --git a/python/ray/tests/runtime_env_container/test_log_file_exists.py b/python/ray/tests/runtime_env_container/test_log_file_exists.py index 5f6cc2e87c07..4288f1fba4be 100644 --- a/python/ray/tests/runtime_env_container/test_log_file_exists.py +++ b/python/ray/tests/runtime_env_container/test_log_file_exists.py @@ -7,6 +7,11 @@ parser = argparse.ArgumentParser() parser.add_argument("--image", type=str, help="The docker image to use 
for Ray worker") +parser.add_argument( + "--use-image-uri-api", + action="store_true", + help="Whether to use the new `image_uri` API instead of the old `container` API.", +) args = parser.parse_args() worker_pth = get_ray_default_worker_file_path() @@ -27,8 +32,14 @@ def task_finished(): return True +if args.use_image_uri_api: + runtime_env = {"image_uri": args.image} +else: + runtime_env = {"container": {"image": args.image, "worker_path": worker_pth}} + + # Run a basic workload. -@ray.remote(runtime_env={"container": {"image": args.image, "worker_path": worker_pth}}) +@ray.remote(runtime_env=runtime_env) def f(): for i in range(10): print(f"test {i}") diff --git a/python/ray/tests/runtime_env_container/test_put_get.py b/python/ray/tests/runtime_env_container/test_put_get.py index a42fc330f4db..d3d58727068d 100644 --- a/python/ray/tests/runtime_env_container/test_put_get.py +++ b/python/ray/tests/runtime_env_container/test_put_get.py @@ -5,12 +5,23 @@ parser = argparse.ArgumentParser() parser.add_argument("--image", type=str, help="The docker image to use for Ray worker") +parser.add_argument( + "--use-image-uri-api", + action="store_true", + help="Whether to use the new `image_uri` API instead of the old `container` API.", +) args = parser.parse_args() worker_pth = get_ray_default_worker_file_path() -@ray.remote(runtime_env={"container": {"image": args.image, "worker_path": worker_pth}}) +if args.use_image_uri_api: + runtime_env = {"image_uri": args.image} +else: + runtime_env = {"container": {"image": args.image, "worker_path": worker_pth}} + + +@ray.remote(runtime_env=runtime_env) def create_ref(): with open("file.txt") as f: assert f.read().strip() == "helloworldalice" diff --git a/python/ray/tests/runtime_env_container/test_ray_env_vars.py b/python/ray/tests/runtime_env_container/test_ray_env_vars.py index b7288caaa29f..d97b714f895b 100644 --- a/python/ray/tests/runtime_env_container/test_ray_env_vars.py +++ 
b/python/ray/tests/runtime_env_container/test_ray_env_vars.py @@ -6,17 +6,28 @@ parser = argparse.ArgumentParser() parser.add_argument("--image", type=str, help="The docker image to use for Ray worker") +parser.add_argument( + "--use-image-uri-api", + action="store_true", + help="Whether to use the new `image_uri` API instead of the old `container` API.", +) args = parser.parse_args() worker_pth = get_ray_default_worker_file_path() -@ray.remote(runtime_env={"container": {"image": args.image, "worker_path": worker_pth}}) +if args.use_image_uri_api: + runtime_env = {"image_uri": args.image} +else: + runtime_env = {"container": {"image": args.image, "worker_path": worker_pth}} + + +@ray.remote(runtime_env=runtime_env) def f(): return os.environ.get("RAY_TEST_ABC") -@ray.remote(runtime_env={"container": {"image": args.image, "worker_path": worker_pth}}) +@ray.remote(runtime_env=runtime_env) def g(): return os.environ.get("TEST_ABC") diff --git a/python/ray/tests/runtime_env_container/test_serve_basic.py b/python/ray/tests/runtime_env_container/test_serve_basic.py index b9fa3356088d..f74412201834 100644 --- a/python/ray/tests/runtime_env_container/test_serve_basic.py +++ b/python/ray/tests/runtime_env_container/test_serve_basic.py @@ -5,16 +5,22 @@ parser = argparse.ArgumentParser() parser.add_argument("--image", type=str, help="The docker image to use for Ray worker") +parser.add_argument( + "--use-image-uri-api", + action="store_true", + help="Whether to use the new `image_uri` API instead of the old `container` API.", +) args = parser.parse_args() WORKER_PATH = get_ray_default_worker_file_path() +if args.use_image_uri_api: + runtime_env = {"image_uri": args.image} +else: + runtime_env = {"container": {"image": args.image, "worker_path": WORKER_PATH}} -@serve.deployment( - ray_actor_options={ - "runtime_env": {"container": {"image": args.image, "worker_path": WORKER_PATH}} - } -) + +@serve.deployment(ray_actor_options={"runtime_env": runtime_env}) class Model: def 
__call__(self): with open("file.txt") as f: diff --git a/python/ray/tests/runtime_env_container/test_serve_telemetry.py b/python/ray/tests/runtime_env_container/test_serve_telemetry.py index 8f361cbc93e1..abcc44ab6053 100644 --- a/python/ray/tests/runtime_env_container/test_serve_telemetry.py +++ b/python/ray/tests/runtime_env_container/test_serve_telemetry.py @@ -16,6 +16,11 @@ parser = argparse.ArgumentParser( description="Example Python script taking command line arguments." ) +parser.add_argument( + "--use-image-uri-api", + action="store_true", + help="Whether to use the new `image_uri` API instead of the old `container` API.", +) parser.add_argument("--image", type=str, help="The docker image to use for Ray worker") args = parser.parse_args() worker_pth = get_ray_default_worker_file_path() @@ -25,6 +30,12 @@ os.environ["RAY_USAGE_STATS_REPORT_INTERVAL_S"] = "1" +if args.use_image_uri_api: + runtime_env = {"image_uri": args.image} +else: + runtime_env = {"container": {"image": args.image, "worker_path": worker_pth}} + + def check_app(app_name: str, expected: str): app_handle = serve.get_app_handle(app_name) ref = app_handle.remote() @@ -88,7 +99,7 @@ def check_telemetry_deployment(): { "name": "app1", "import_path": "serve_application:app", - "runtime_env": {"container": {"image": args.image, "worker_path": worker_pth}}, + "runtime_env": runtime_env, }, ) client.deploy_apps(ServeDeploySchema.parse_obj(config)) @@ -104,9 +115,7 @@ def check_telemetry_deployment(): { "name": "Model", "ray_actor_options": { - "runtime_env": { - "container": {"image": args.image, "worker_path": worker_pth} - }, + "runtime_env": runtime_env, }, } ], diff --git a/python/ray/tests/runtime_env_container/test_shared_memory.py b/python/ray/tests/runtime_env_container/test_shared_memory.py index da5260957bf8..bc697eca0f16 100644 --- a/python/ray/tests/runtime_env_container/test_shared_memory.py +++ b/python/ray/tests/runtime_env_container/test_shared_memory.py @@ -7,12 +7,22 @@ parser = 
argparse.ArgumentParser() parser.add_argument("--image", type=str, help="The docker image to use for Ray worker") +parser.add_argument( + "--use-image-uri-api", + action="store_true", + help="Whether to use the new `image_uri` API instead of the old `container` API.", +) args = parser.parse_args() worker_pth = get_ray_default_worker_file_path() +if args.use_image_uri_api: + runtime_env = {"image_uri": args.image} +else: + runtime_env = {"container": {"image": args.image, "worker_path": worker_pth}} -@ray.remote(runtime_env={"container": {"image": args.image, "worker_path": worker_pth}}) + +@ray.remote(runtime_env=runtime_env) def f(): array = np.random.rand(5000, 5000) return ray.put(array) diff --git a/python/ray/tests/runtime_env_container/test_worker_exit_intended_system_exit_and_user_error.py b/python/ray/tests/runtime_env_container/test_worker_exit_intended_system_exit_and_user_error.py index d7dc1964f771..b3ebde5853bf 100644 --- a/python/ray/tests/runtime_env_container/test_worker_exit_intended_system_exit_and_user_error.py +++ b/python/ray/tests/runtime_env_container/test_worker_exit_intended_system_exit_and_user_error.py @@ -10,9 +10,19 @@ parser = argparse.ArgumentParser() parser.add_argument("--image", type=str, help="The docker image to use for Ray worker") +parser.add_argument( + "--use-image-uri-api", + action="store_true", + help="Whether to use the new `image_uri` API instead of the old `container` API.", +) args = parser.parse_args() + worker_pth = get_ray_default_worker_file_path() +if args.use_image_uri_api: + runtime_env = {"image_uri": args.image} +else: + runtime_env = {"container": {"image": args.image, "worker_path": worker_pth}} ray.init(num_cpus=1) @@ -24,12 +34,12 @@ def get_worker_by_pid(pid, detail=True): assert False -@ray.remote(runtime_env={"container": {"image": args.image, "worker_path": worker_pth}}) +@ray.remote(runtime_env=runtime_env) def f(): return ray.get(g.remote()) -@ray.remote(runtime_env={"container": {"image": 
args.image, "worker_path": worker_pth}}) +@ray.remote(runtime_env=runtime_env) def g(): return os.getpid() @@ -54,7 +64,7 @@ def verify_exit_by_idle_timeout(): @ray.remote( num_cpus=1, - runtime_env={"container": {"image": args.image, "worker_path": worker_pth}}, + runtime_env=runtime_env, ) class A: def __init__(self): @@ -94,7 +104,7 @@ def verify_exit_by_pg_removed(): wait_for_condition(verify_exit_by_pg_removed) -@ray.remote(runtime_env={"container": {"image": args.image, "worker_path": worker_pth}}) +@ray.remote(runtime_env=runtime_env) class PidDB: def __init__(self): self.pid = None @@ -109,7 +119,7 @@ def get_pid(self): p = PidDB.remote() -@ray.remote(runtime_env={"container": {"image": args.image, "worker_path": worker_pth}}) +@ray.remote(runtime_env=runtime_env) class FaultyActor: def __init__(self): p.record_pid.remote(os.getpid()) diff --git a/python/ray/tests/test_runtime_env_container.py b/python/ray/tests/test_runtime_env_container.py index 721bfa68c97b..963f510479e4 100644 --- a/python/ray/tests/test_runtime_env_container.py +++ b/python/ray/tests/test_runtime_env_container.py @@ -16,43 +16,58 @@ @pytest.mark.skipif(sys.platform != "linux", reason="Only works on Linux.") -def test_put_get(podman_docker_cluster): +@pytest.mark.parametrize("use_image_uri_api", [True, False]) +def test_put_get(podman_docker_cluster, use_image_uri_api): """Test ray.put and ray.get.""" container_id = podman_docker_cluster cmd = ["python", "tests/test_put_get.py", "--image", NESTED_IMAGE_NAME] + if use_image_uri_api: + cmd.append("--use-image-uri-api") run_in_container([cmd], container_id) @pytest.mark.skipif(sys.platform != "linux", reason="Only works on Linux.") -def test_shared_memory(podman_docker_cluster): +@pytest.mark.parametrize("use_image_uri_api", [True, False]) +def test_shared_memory(podman_docker_cluster, use_image_uri_api): """Test shared memory.""" container_id = podman_docker_cluster cmd = ["python", "tests/test_shared_memory.py", "--image", 
NESTED_IMAGE_NAME] + if use_image_uri_api: + cmd.append("--use-image-uri-api") run_in_container([cmd], container_id) @pytest.mark.skipif(sys.platform != "linux", reason="Only works on Linux.") -def test_log_file_exists(podman_docker_cluster): +@pytest.mark.parametrize("use_image_uri_api", [True, False]) +def test_log_file_exists(podman_docker_cluster, use_image_uri_api): """Verify worker log file exists""" container_id = podman_docker_cluster cmd = ["python", "tests/test_log_file_exists.py", "--image", NESTED_IMAGE_NAME] + if use_image_uri_api: + cmd.append("--use-image-uri-api") run_in_container([cmd], container_id) @pytest.mark.skipif(sys.platform != "linux", reason="Only works on Linux.") -def test_ray_env_vars(podman_docker_cluster): +@pytest.mark.parametrize("use_image_uri_api", [True, False]) +def test_ray_env_vars(podman_docker_cluster, use_image_uri_api): """Test ray.put and ray.get.""" container_id = podman_docker_cluster cmd = ["python", "tests/test_ray_env_vars.py", "--image", NESTED_IMAGE_NAME] + if use_image_uri_api: + cmd.append("--use-image-uri-api") run_in_container([cmd], container_id) @pytest.mark.skipif(sys.platform != "linux", reason="Only works on Linux.") -def test_worker_exit_intended_system_exit_and_user_error(podman_docker_cluster): +@pytest.mark.parametrize("use_image_uri_api", [True, False]) +def test_worker_exit_intended_system_exit_and_user_error( + podman_docker_cluster, use_image_uri_api +): """ INTENDED_SYSTEM_EXIT - (not tested, hard to test) Unused resource removed @@ -69,25 +84,33 @@ def test_worker_exit_intended_system_exit_and_user_error(podman_docker_cluster): "--image", NESTED_IMAGE_NAME, ] + if use_image_uri_api: + cmd.append("--use-image-uri-api") run_in_container([cmd], container_id) @pytest.mark.skipif(sys.platform != "linux", reason="Only works on Linux.") -def test_serve_basic(podman_docker_cluster): +@pytest.mark.parametrize("use_image_uri_api", [True, False]) +def test_serve_basic(podman_docker_cluster, 
use_image_uri_api): """Test Serve deployment.""" container_id = podman_docker_cluster cmd = ["python", "tests/test_serve_basic.py", "--image", NESTED_IMAGE_NAME] + if use_image_uri_api: + cmd.append("--use-image-uri-api") run_in_container([cmd], container_id) @pytest.mark.skipif(sys.platform != "linux", reason="Only works on Linux.") @pytest.mark.skip -def test_serve_telemetry(podman_docker_cluster): +@pytest.mark.parametrize("use_image_uri_api", [True, False]) +def test_serve_telemetry(podman_docker_cluster, use_image_uri_api): """Test Serve deployment telemetry.""" container_id = podman_docker_cluster cmd = ["python", "tests/test_serve_telemetry.py", "--image", NESTED_IMAGE_NAME] + if use_image_uri_api: + cmd.append("--use-image-uri-api") run_in_container([cmd], container_id) From 3f29274fada445fdbc4fdeb6dc6c0d6d58f24ca7 Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Wed, 29 May 2024 19:00:59 +0200 Subject: [PATCH 41/65] [RLlib] Fix wrong `env` being passed into `on_episode_end` callback on MultiAgentEnvRunner when sampling whole episodes. 
(#45617) --- .../tests/test_callbacks_on_env_runner.py | 56 +++++++++++++++---- rllib/env/multi_agent_env_runner.py | 2 +- 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/rllib/algorithms/tests/test_callbacks_on_env_runner.py b/rllib/algorithms/tests/test_callbacks_on_env_runner.py index 34329c20bf41..ce7d97444cd3 100644 --- a/rllib/algorithms/tests/test_callbacks_on_env_runner.py +++ b/rllib/algorithms/tests/test_callbacks_on_env_runner.py @@ -1,10 +1,15 @@ from collections import Counter import unittest +import gymnasium as gym + import ray +from ray import tune from ray.rllib.algorithms.callbacks import DefaultCallbacks from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.utils.test_utils import framework_iterator +from ray.rllib.env.env_runner import EnvRunner +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger class EpisodeAndSampleCallbacks(DefaultCallbacks): @@ -12,24 +17,38 @@ def __init__(self): super().__init__() self.counts = Counter() - def on_environment_created(self, *args, **kwargs): + def on_environment_created(self, *args, env_runner, metrics_logger, env, **kwargs): + self.counts.update({"env_created": 1}) - def on_episode_start(self, *args, **kwargs): + def on_episode_start(self, *args, env_runner, metrics_logger, env, **kwargs): + assert isinstance(env_runner, EnvRunner) + assert isinstance(metrics_logger, MetricsLogger) + assert isinstance(env, gym.Env) self.counts.update({"start": 1}) - def on_episode_step(self, *args, **kwargs): + def on_episode_step(self, *args, env_runner, metrics_logger, env, **kwargs): + assert isinstance(env_runner, EnvRunner) + assert isinstance(metrics_logger, MetricsLogger) + assert isinstance(env, gym.Env) self.counts.update({"step": 1}) - def on_episode_end(self, *args, **kwargs): + def on_episode_end(self, *args, env_runner, metrics_logger, env, **kwargs): + assert isinstance(env_runner, EnvRunner) + 
assert isinstance(metrics_logger, MetricsLogger) + assert isinstance(env, gym.Env) self.counts.update({"end": 1}) - def on_sample_end(self, *args, **kwargs): + def on_sample_end(self, *args, env_runner, metrics_logger, **kwargs): + assert isinstance(env_runner, EnvRunner) + assert isinstance(metrics_logger, MetricsLogger) self.counts.update({"sample": 1}) class OnEnvironmentCreatedCallback(DefaultCallbacks): def on_environment_created(self, *, env_runner, env, env_context, **kwargs): + assert isinstance(env_runner, EnvRunner) + assert isinstance(env, gym.Env) # Create a vector-index-sum property per remote worker. if not hasattr(env_runner, "sum_sub_env_vector_indices"): env_runner.sum_sub_env_vector_indices = 0 @@ -63,6 +82,7 @@ def on_episode_created( class TestCallbacks(unittest.TestCase): @classmethod def setUpClass(cls): + tune.register_env("ma_cart", lambda _: MultiAgentCartPole({"num_agents": 2})) ray.init() @classmethod @@ -88,7 +108,14 @@ def test_episode_and_sample_callbacks_batch_mode_truncate_episodes(self): num_sgd_iter=1, ) ) - for _ in framework_iterator(config, frameworks=("torch", "tf2")): + + for multi_agent in [False, True]: + if multi_agent: + config.multi_agent( + policies={"p0", "p1"}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + config.environment("ma_cart") algo = config.build() callback_obj = algo.workers.local_worker()._callbacks @@ -133,7 +160,15 @@ def test_episode_and_sample_callbacks_batch_mode_complete_episodes(self): num_sgd_iter=1, ) ) - for _ in framework_iterator(config, frameworks=("torch", "tf2")): + + for multi_agent in [False, True]: + if multi_agent: + config.multi_agent( + policies={"p0", "p1"}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + config.environment("ma_cart") + algo = config.build() callback_obj = algo.workers.local_worker()._callbacks @@ -142,8 +177,9 @@ def test_episode_and_sample_callbacks_batch_mode_complete_episodes(self): # Train one iteration. 
algo.train() - # We must have has exactly one `sample()` call on our EnvRunner. - self.assertEqual(callback_obj.counts["sample"], 1) + # We must have had exactly one `sample()` call on our EnvRunner. + if not multi_agent: + self.assertEqual(callback_obj.counts["sample"], 1) # We should have had at least one episode start. self.assertGreater(callback_obj.counts["start"], 0) # Episode starts must be exact same as episode ends (b/c we always complete diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index 077adbdad589..e179d047cfc9 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -551,7 +551,7 @@ def _sample_episodes( # but after(!) the last env-to-module connector call has been made. # -> All obs (even the terminal one) should have been processed now (by # the connector, if applicable). - self._make_on_episode_callback("on_episode_end") + self._make_on_episode_callback("on_episode_end", _episode) # Finish the episode. done_episodes_to_return.append( From 14bf327349aa5ab71756501725e200718fb10eb0 Mon Sep 17 00:00:00 2001 From: Lonnie Liu <95255098+aslonnie@users.noreply.github.com> Date: Wed, 29 May 2024 10:45:52 -0700 Subject: [PATCH 42/65] [ci] add security requirements constraint (#45616) for bumping package versions up in the container and dodging cve's also upgrade `idna` and add missing `cupy-cuda11x` package in constraints.. 
Signed-off-by: Lonnie Liu --- ci/ci.sh | 3 ++- python/requirements/security-requirements.txt | 7 +++++++ python/requirements_compiled.txt | 8 +++++++- 3 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 python/requirements/security-requirements.txt diff --git a/ci/ci.sh b/ci/ci.sh index d29ee58c3fe0..3a09f7f155b8 100755 --- a/ci/ci.sh +++ b/ci/ci.sh @@ -106,7 +106,8 @@ compile_pip_dependencies() { "${WORKSPACE_DIR}/python/requirements/ml/train-requirements.txt" \ "${WORKSPACE_DIR}/python/requirements/ml/train-test-requirements.txt" \ "${WORKSPACE_DIR}/python/requirements/ml/tune-requirements.txt" \ - "${WORKSPACE_DIR}/python/requirements/ml/tune-test-requirements.txt" + "${WORKSPACE_DIR}/python/requirements/ml/tune-test-requirements.txt" \ + "${WORKSPACE_DIR}/python/requirements/security-requirements.txt" # Remove some pins from upstream dependencies: # ray, xgboost-ray, lightgbm-ray, tune-sklearn diff --git a/python/requirements/security-requirements.txt b/python/requirements/security-requirements.txt new file mode 100644 index 000000000000..c6c562790098 --- /dev/null +++ b/python/requirements/security-requirements.txt @@ -0,0 +1,7 @@ +# Requirement constraints for security. +# +# Only for constraining version ranges. Packages listed might not be installed, +# might not be ray requirements. 
They are only used in compiling the +# constraint file + +idna>=3.7 diff --git a/python/requirements_compiled.txt b/python/requirements_compiled.txt index d95e738ae605..4eedb59f751f 100644 --- a/python/requirements_compiled.txt +++ b/python/requirements_compiled.txt @@ -372,6 +372,8 @@ cryptography==38.0.1 # python-jose # sshpubkeys # trustme +cupy-cuda11x==13.1.0 ; sys_platform != "darwin" + # via -r /ray/ci/../python/requirements/ml/dl-cpu-requirements.txt cycler==0.12.1 # via matplotlib cython==0.29.37 @@ -496,6 +498,8 @@ fasteners==0.19 # gsutil fastjsonschema==2.19.0 # via nbformat +fastrlock==0.8.2 + # via cupy-cuda11x feather-format==0.4.1 # via -r /ray/ci/../python/requirements/test-requirements.txt ffmpy==0.3.1 @@ -765,8 +769,9 @@ humanfriendly==10.0 # coloredlogs hyperopt==0.2.7 # via -r /ray/ci/../python/requirements/ml/tune-requirements.txt -idna==3.6 +idna==3.7 # via + # -r /ray/ci/../python/requirements/security-requirements.txt # anyio # httpx # jsonschema @@ -1239,6 +1244,7 @@ numpy==1.24.4 # cmdstanpy # configspace # contourpy + # cupy-cuda11x # dask # datasets # deepspeed From 75161ab72f8d76c9365756b5d86f111582bbb42a Mon Sep 17 00:00:00 2001 From: Lonnie Liu <95255098+aslonnie@users.noreply.github.com> Date: Wed, 29 May 2024 12:01:36 -0700 Subject: [PATCH 43/65] [deps] remove duplicated lines in requirements.txt (#45615) some packages are declared more than once. 
Signed-off-by: Lonnie Liu --- python/requirements.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/requirements.txt b/python/requirements.txt index efab173a8344..e937c7352584 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -34,7 +34,6 @@ aiorwlock opentelemetry-exporter-otlp scipy colorful -pyyaml rich opentelemetry-sdk fastapi @@ -47,7 +46,6 @@ dm_tree uvicorn scikit-image==0.21.0 prometheus_client>=0.7.1 -requests pandas tensorboardX aiohttp>=3.7,<3.9 @@ -57,5 +55,4 @@ fsspec pandas>=1.3 pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 # Serve users can use pydantic<2 py-spy>=0.2.0 -watchfiles memray; sys_platform != "win32" # memray is not supported on Windows From 8752932b2505e4663619499aab44845a72f28ef8 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Wed, 29 May 2024 13:38:01 -0700 Subject: [PATCH 44/65] [ADAG] Support multi-args and kwargs in Accelerate DAG (#45545) This PR adds multi-arg and kwarg support by serializing all positional args and kwargs and passing it through the channel. When the channel is read at runtime, the individual args are extracted first before passing to the consuming tasks. Closes #42793 --------- Signed-off-by: Rui Qiao Signed-off-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> --- python/ray/dag/compiled_dag_node.py | 247 ++++++++++++------ .../experimental/test_accelerated_dag.py | 173 ++++++++++-- 2 files changed, 314 insertions(+), 106 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index e0cb598d9444..d53472f4a1d6 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -106,15 +106,6 @@ def _prep_task(self, task: "ExecutableTask") -> None: """ Prepare the task for execution. """ - # Add placeholders for input channels. 
- for idx, inp in enumerate(task.resolved_args): - if isinstance(inp, ChannelInterface): - task.input_channels.append(inp) - task.input_channel_idxs.append(idx) - task.resolved_inputs.append(None) - else: - task.resolved_inputs.append(inp) - for typ_hint in task.input_type_hints: typ_hint.register_custom_serializer() task.output_type_hint.register_custom_serializer() @@ -158,11 +149,12 @@ def _exec_task(self, task: "ExecutableTask", idx: int) -> bool: input_reader.end_read() return False - for idx, output in zip(task.input_channel_idxs, res): - task.resolved_inputs[idx] = output + resolved_inputs = [] + for task_input in task.task_inputs: + resolved_inputs.append(task_input.resolve(res)) try: - output_val = method(*task.resolved_inputs) + output_val = method(*resolved_inputs) output_writer.write(output_val) except Exception as exc: output_writer.write(_wrap_exception(exc)) @@ -243,6 +235,84 @@ def __str__(self) -> str: """ +@DeveloperAPI +class DAGInputAdapter: + """Adapter to extract individual positional arguments and kwargs + from objects read from DAG input channel.""" + + def __init__( + self, + input_attr_node: Optional["ray.dag.InputAttributeNode"], + dag_input_channel: "ray.experimental.channel.ChannelInterface", + ): + """ + Args: + input_attr_node: The input attribute node that this adapter is + created for. None should be used when creating an adapter for + the DAG input node itself; in this case, the adapter will + extract the 0th positional argument. + dag_input_channel: The DAG input channel. 
+ """ + self._dag_input_channel = dag_input_channel + + def extractor(key: Union[int, str]): + def extract_arg(args_tuple): + positional_args, kwargs = args_tuple + if isinstance(key, int): + return positional_args[key] + else: + return kwargs[key] + + return extract_arg + + if input_attr_node: + key = input_attr_node.get_other_args_to_resolve()["key"] + else: + key = 0 + self._adapt_method = extractor(key) + + def adapt(self, input): + return self._adapt_method(input) + + def get_dag_input_channel(self): + return self._dag_input_channel + + +class _ExecutableTaskInput: + """Represents an input to an ExecutableTask. + + Args: + input_variant: either an unresolved input (when type is ChannelInterface + or DAGInputAdapter), or a resolved input value (when type is Any) + channel_idx: if input_variant is an unresolved input, this is the index + into the input channels list. + """ + + def __init__( + self, + input_variant: Union[ChannelInterface, DAGInputAdapter, Any], + channel_idx: Optional[int], + ): + self.input_variant = input_variant + self.channel_idx = channel_idx + + def resolve(self, channel_results: Any): + """ + Resolve the input value from the channel results. + + Args: + channel_results: The results from reading the input channels. 
+ """ + if isinstance(self.input_variant, ChannelInterface): + value = channel_results[self.channel_idx] + elif isinstance(self.input_variant, DAGInputAdapter): + adapter = self.input_variant + value = adapter.adapt(channel_results[self.channel_idx]) + else: + value = self.input_variant + return value + + @DeveloperAPI class ExecutableTask: """A task that can be executed in a compiled DAG, and it @@ -265,13 +335,37 @@ def __init__( self.method_name = task.dag_node.get_method_name() self.bind_index = task.dag_node._get_bind_index() self.output_channel = task.output_channel - self.resolved_args = resolved_args self.input_type_hints: List["ChannelOutputType"] = task.arg_type_hints self.output_type_hint: "ChannelOutputType" = task.dag_node.type_hint - self.resolved_inputs: List[Union[Any, ChannelInterface]] = [] self.input_channels: List[ChannelInterface] = [] - self.input_channel_idxs: List[int] = [] + self.task_inputs: List[_ExecutableTaskInput] = [] + + # Reverse map for input_channels: maps an input channel to + # its index in input_channels. + input_channel_to_idx: dict[ChannelInterface, int] = {} + + for arg in resolved_args: + if isinstance(arg, ChannelInterface) or isinstance(arg, DAGInputAdapter): + if isinstance(arg, ChannelInterface): + channel = arg + else: + adapter = arg + channel = adapter.get_dag_input_channel() + + if channel in input_channel_to_idx: + # The same channel was added before, so reuse the index. + channel_idx = input_channel_to_idx[channel] + else: + # Add a new channel to the list of input channels. + self.input_channels.append(channel) + channel_idx = len(self.input_channels) - 1 + input_channel_to_idx[channel] = channel_idx + + task_input = _ExecutableTaskInput(arg, channel_idx) + else: + task_input = _ExecutableTaskInput(arg, None) + self.task_inputs.append(task_input) @DeveloperAPI @@ -398,22 +492,28 @@ def _preprocess(self) -> None: nccl_actors: Set["ray.actor.ActorHandle"] = set() + # Find the input node to the DAG. 
+ for idx, task in self.idx_to_task.items(): + if isinstance(task.dag_node, InputNode): + assert self.input_task_idx is None, "more than one InputNode found" + self.input_task_idx = idx + # TODO: Support no-input DAGs (use an empty object to signal). + if self.input_task_idx is None: + raise NotImplementedError( + "Compiled DAGs currently require exactly one InputNode" + ) + # For each task node, set its upstream and downstream task nodes. # Also collect the set of tasks that produce torch.tensors. for node_idx, task in self.idx_to_task.items(): dag_node = task.dag_node if not ( isinstance(dag_node, InputNode) + or isinstance(dag_node, InputAttributeNode) or isinstance(dag_node, MultiOutputNode) or isinstance(dag_node, ClassMethodNode) ): - if isinstance(dag_node, InputAttributeNode): - # TODO(swang): Support multi args. - raise NotImplementedError( - "Compiled DAGs currently do not support kwargs or " - "multiple args for InputNode" - ) - elif isinstance(dag_node, FunctionNode): + if isinstance(dag_node, FunctionNode): # TODO(swang): Support non-actor tasks. raise NotImplementedError( "Compiled DAGs currently only support actor method nodes" @@ -456,6 +556,12 @@ def _preprocess(self) -> None: downstream_actor_handle = None if isinstance(task.dag_node, ClassMethodNode): downstream_actor_handle = task.dag_node._get_actor_handle() + + # If the upstream node is an InputAttributeNode, treat the + # DAG's input node as the actual upstream node + if isinstance(upstream_node.dag_node, InputAttributeNode): + upstream_node = self.idx_to_task[self.input_task_idx] + upstream_node.downstream_node_idxs[node_idx] = downstream_actor_handle task.arg_type_hints.append(upstream_node.dag_node.type_hint) @@ -466,19 +572,12 @@ def _preprocess(self) -> None: if dag_node.type_hint is not None: self._type_hints.append(dag_node.type_hint) - # Find the input node to the DAG. 
- for idx, task in self.idx_to_task.items(): - if isinstance(task.dag_node, InputNode): - assert self.input_task_idx is None, "more than one InputNode found" - self.input_task_idx = idx - # TODO: Support no-input DAGs (use an empty object to signal). - if self.input_task_idx is None: - raise NotImplementedError( - "Compiled DAGs currently require exactly one InputNode" - ) - # Find the (multi-)output node to the DAG. for idx, task in self.idx_to_task.items(): + if idx == self.input_task_idx or isinstance( + task.dag_node, InputAttributeNode + ): + continue if len(task.downstream_node_idxs) == 0: assert self.output_task_idx is None, "More than one output node found" self.output_task_idx = idx @@ -516,7 +615,13 @@ def _get_or_compile( _dag_output_fetcher will be set and can be used to invoke and fetch outputs for the DAG. """ - from ray.dag import DAGNode, InputNode, MultiOutputNode, ClassMethodNode + from ray.dag import ( + DAGNode, + InputNode, + InputAttributeNode, + MultiOutputNode, + ClassMethodNode, + ) if self.input_task_idx is None: self._preprocess() @@ -582,35 +687,32 @@ def _get_node_id(self): self.actor_refs.add(actor_handle) self.actor_to_tasks[actor_handle].append(task) elif isinstance(task.dag_node, InputNode): - readers = [self.idx_to_task[idx] for idx in task.downstream_node_idxs] - reader_handles = [] reader_handles_set = set() - for reader in readers: - reader_handle = reader.dag_node._get_actor_handle() - if reader_handle in reader_handles_set: - raise NotImplementedError( - "Compiled DAGs currently do not support binding the " - "same input on the same actor multiple times. 
" - f"Violating actor: {reader_handle}" - ) + for idx in task.downstream_node_idxs: + reader_task = self.idx_to_task[idx] + assert isinstance(reader_task.dag_node, ClassMethodNode) + reader_handle = reader_task.dag_node._get_actor_handle() reader_handles_set.add(reader_handle) - reader_handles.append(reader_handle) task.output_channel = do_allocate_channel( self, - reader_handles, + list(reader_handles_set), typ=type_hint, ) else: - assert isinstance(task.dag_node, MultiOutputNode) + assert isinstance(task.dag_node, InputAttributeNode) or isinstance( + task.dag_node, MultiOutputNode + ) for idx in task.downstream_node_idxs: frontier.append(idx) # Validate input channels for tasks that have not been visited for node_idx, task in self.idx_to_task.items(): - if node_idx == self.input_task_idx: - continue - if node_idx == self.output_task_idx: + if ( + node_idx == self.input_task_idx + or node_idx == self.output_task_idx + or isinstance(task.dag_node, InputAttributeNode) + ): continue if node_idx not in visited: has_at_least_one_channel_input = False @@ -623,6 +725,11 @@ def _get_node_id(self): "or at least one other DAGNode as an input" ) + input_task = self.idx_to_task[self.input_task_idx] + # Register custom serializers for inputs provided to dag.execute(). 
+ input_task.dag_node.type_hint.register_custom_serializer() + self.dag_input_channel = input_task.output_channel + # Create executable tasks for each actor for actor_handle, tasks in self.actor_to_tasks.items(): executable_tasks = [] @@ -631,7 +738,15 @@ def _get_node_id(self): resolved_args = [] has_at_least_one_channel_input = False for arg in task.args: - if isinstance(arg, DAGNode): + if isinstance(arg, InputNode): + input_adapter = DAGInputAdapter(None, self.dag_input_channel) + resolved_args.append(input_adapter) + has_at_least_one_channel_input = True + elif isinstance(arg, InputAttributeNode): + input_adapter = DAGInputAdapter(arg, self.dag_input_channel) + resolved_args.append(input_adapter) + has_at_least_one_channel_input = True + elif isinstance(arg, DAGNode): # Other DAGNodes arg_idx = self.dag_node_to_idx[arg] arg_channel = self.idx_to_task[arg_idx].output_channel assert arg_channel is not None @@ -666,12 +781,6 @@ def _get_node_id(self): executable_tasks, ) - input_task = self.idx_to_task[self.input_task_idx] - # Register custom serializers for inputs provided to dag.execute(). - input_task.dag_node.type_hint.register_custom_serializer() - - self.dag_input_channel = input_task.output_channel - self.dag_output_channels = [] for output in self.idx_to_task[self.output_task_idx].args: assert isinstance(output, DAGNode) @@ -708,7 +817,6 @@ def _get_node_id(self): self._dag_submitter.start() self._dag_output_fetcher.start() - return def _monitor_failures(self): outer = self @@ -808,24 +916,17 @@ def execute( Args: args: Args to the InputNode. - kwargs: Kwargs to the InputNode. Not supported yet. + kwargs: Kwargs to the InputNode Returns: A list of Channels that can be used to read the DAG result. """ - # These errors should already be caught during compilation, but just in - # case. 
- if len(args) != 1: - raise NotImplementedError("Compiled DAGs support exactly one InputNode arg") - if len(kwargs) != 0: - raise NotImplementedError("Compiled DAGs do not support kwargs") - if self._enable_asyncio: raise ValueError("Use execute_async if enable_asyncio=True") self._get_or_compile() - inp = args[0] + inp = (args, kwargs) self._dag_submitter.write(inp) return self._dag_output_fetcher @@ -841,25 +942,17 @@ async def execute_async( Args: args: Args to the InputNode. - kwargs: Kwargs to the InputNode. Not supported yet. + kwargs: Kwargs to the InputNode. Returns: A list of Channels that can be used to read the DAG result. """ - # These errors should already be caught during compilation, but just in - # case. - if len(args) != 1: - raise NotImplementedError("Compiled DAGs support exactly one InputNode arg") - if len(kwargs) != 0: - raise NotImplementedError("Compiled DAGs do not support kwargs") - if not self._enable_asyncio: raise ValueError("Use execute if enable_asyncio=False") self._get_or_compile() async with self._dag_submission_lock: - inp = args[0] - + inp = (args, kwargs) await self._dag_submitter.write(inp) # Allocate a future that the caller can use to get the result. 
fut = asyncio.Future() diff --git a/python/ray/dag/tests/experimental/test_accelerated_dag.py b/python/ray/dag/tests/experimental/test_accelerated_dag.py index 2f6f9b4a2d46..1923deaafb56 100644 --- a/python/ray/dag/tests/experimental/test_accelerated_dag.py +++ b/python/ray/dag/tests/experimental/test_accelerated_dag.py @@ -71,6 +71,27 @@ def sleep(self, x): return x +@ray.remote +class Collector: + def __init__(self): + self.results = [] + + def collect(self, x): + self.results.append(x) + return self.results + + def collect_two(self, x, y): + self.results.append(x) + self.results.append(y) + return self.results + + def collect_three(self, x, y, z): + self.results.append(x) + self.results.append(y) + self.results.append(z) + return self.results + + def test_basic(ray_start_regular): a = Actor.remote(0) with InputNode() as i: @@ -175,6 +196,129 @@ def test_regular_args(ray_start_regular): compiled_dag.teardown() +def test_multi_args_basic(ray_start_regular): + a1 = Actor.remote(0) + a2 = Actor.remote(0) + c = Collector.remote() + with InputNode() as i: + branch1 = a1.inc.bind(i[0]) + branch2 = a2.inc.bind(i[1]) + dag = c.collect_two.bind(branch2, branch1) + + compiled_dag = dag.experimental_compile() + + output_channel = compiled_dag.execute(2, 3) + result = output_channel.begin_read() + assert result == [3, 2] + output_channel.end_read() + + compiled_dag.teardown() + + +def test_multi_args_single_actor(ray_start_regular): + c = Collector.remote() + with InputNode() as i: + dag = c.collect_two.bind(i[1], i[0]) + + compiled_dag = dag.experimental_compile() + + for i in range(3): + output_channel = compiled_dag.execute(2, 3) + result = output_channel.begin_read() + assert result == [3, 2] * (i + 1) + output_channel.end_read() + + compiled_dag.teardown() + + +def test_multi_args_branch(ray_start_regular): + a = Actor.remote(0) + c = Collector.remote() + with InputNode() as i: + branch = a.inc.bind(i[0]) + dag = c.collect_two.bind(branch, i[1]) + + compiled_dag = 
dag.experimental_compile() + + output_channel = compiled_dag.execute(2, 3) + result = output_channel.begin_read() + assert result == [2, 3] + output_channel.end_read() + + compiled_dag.teardown() + + +def test_kwargs_basic(ray_start_regular): + a1 = Actor.remote(0) + a2 = Actor.remote(0) + c = Collector.remote() + with InputNode() as i: + branch1 = a1.inc.bind(i.x) + branch2 = a2.inc.bind(i.y) + dag = c.collect_two.bind(branch2, branch1) + + compiled_dag = dag.experimental_compile() + + output_channel = compiled_dag.execute(x=2, y=3) + result = output_channel.begin_read() + assert result == [3, 2] + output_channel.end_read() + + compiled_dag.teardown() + + +def test_kwargs_single_actor(ray_start_regular): + c = Collector.remote() + with InputNode() as i: + dag = c.collect_two.bind(i.y, i.x) + + compiled_dag = dag.experimental_compile() + + for i in range(3): + output_channel = compiled_dag.execute(x=2, y=3) + result = output_channel.begin_read() + assert result == [3, 2] * (i + 1) + output_channel.end_read() + + compiled_dag.teardown() + + +def test_kwargs_branch(ray_start_regular): + a = Actor.remote(0) + c = Collector.remote() + with InputNode() as i: + branch = a.inc.bind(i.x) + dag = c.collect_two.bind(i.y, branch) + + compiled_dag = dag.experimental_compile() + + output_channel = compiled_dag.execute(x=2, y=3) + result = output_channel.begin_read() + assert result == [3, 2] + output_channel.end_read() + + compiled_dag.teardown() + + +def test_multi_args_and_kwargs(ray_start_regular): + a1 = Actor.remote(0) + a2 = Actor.remote(0) + c = Collector.remote() + with InputNode() as i: + branch1 = a1.inc.bind(i[0]) + branch2 = a2.inc.bind(i.y) + dag = c.collect_three.bind(branch2, i.z, branch1) + + compiled_dag = dag.experimental_compile() + + output_channel = compiled_dag.execute(2, y=3, z=4) + result = output_channel.begin_read() + assert result == [3, 4, 2] + output_channel.end_read() + + compiled_dag.teardown() + + @pytest.mark.parametrize("num_actors", [1, 4]) 
def test_scatter_gather_dag(ray_start_regular, num_actors): actors = [Actor.remote(0) for _ in range(num_actors)] @@ -252,17 +396,6 @@ def test_dag_errors(ray_start_regular): ): dag.experimental_compile() - with InputNode() as inp: - dag = a.inc.bind(inp) - dag2 = a.inc.bind(inp) - dag3 = a.inc_two.bind(dag, dag2) - with pytest.raises( - NotImplementedError, - match=r"Compiled DAGs currently do not support binding the same input " - "on the same actor multiple times.*", - ): - dag3.experimental_compile() - @ray.remote def f(x): return x @@ -275,24 +408,6 @@ def f(x): ): dag.experimental_compile() - with InputNode() as inp: - dag = a.inc_two.bind(inp[0], inp[1]) - with pytest.raises( - NotImplementedError, - match="Compiled DAGs currently do not support kwargs or multiple args " - "for InputNode", - ): - dag.experimental_compile() - - with InputNode() as inp: - dag = a.inc_two.bind(inp.x, inp.y) - with pytest.raises( - NotImplementedError, - match="Compiled DAGs currently do not support kwargs or multiple args " - "for InputNode", - ): - dag.experimental_compile() - def test_dag_fault_tolerance_chain(ray_start_regular_shared): actors = [ From d76518fe89a6877ed4e858b1d55746fd2a1e9e30 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Wed, 29 May 2024 14:01:54 -0700 Subject: [PATCH 45/65] [Core] Improve doc for --object-store-memory to describe how the default value is set (#45301) Currently it's unclear how the default value is set Signed-off-by: Jiajun Yao --- python/ray/_private/worker.py | 8 ++++++-- python/ray/scripts/scripts.py | 5 ++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index cb6a8871d4b5..d4bba87fc4e8 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -1302,8 +1302,12 @@ def init( quantities for them available. labels: [Experimental] The key-value labels of the node. 
object_store_memory: The amount of memory (in bytes) to start the - object store with. By default, this is automatically set based on - available system memory. + object store with. + By default, this is 30% + (ray_constants.DEFAULT_OBJECT_STORE_MEMORY_PROPORTION) + of available system memory capped by + the shm size and 200G (ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES) + but can be set higher. local_mode: Deprecated: consider using the Ray Debugger instead. ignore_reinit_error: If true, Ray suppresses errors from calling ray.init() a second time. Ray won't be restarted. diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index 65158ea3f6c9..6ed5fabea435 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -367,7 +367,10 @@ def debug(address): required=False, type=int, help="The amount of memory (in bytes) to start the object store with. " - "By default, this is capped at 20GB but can be set higher.", + "By default, this is 30% (ray_constants.DEFAULT_OBJECT_STORE_MEMORY_PROPORTION) " + "of available system memory capped by " + "the shm size and 200G (ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES) " + "but can be set higher.", ) @click.option( "--redis-max-memory", From 3518518828de760f2ffb79f494ca6b26fb389bca Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Wed, 29 May 2024 14:25:58 -0700 Subject: [PATCH 46/65] [ci] do not clean up release test data (#45627) This code path deletes the release test working directory upon the job completion. We found repetitive cases where users want the data to be available for debugging purpose. Let's rely on s3 policy to clean up the data after a few days. 
Test: - CI Signed-off-by: can --- .../ray_release/command_runner/anyscale_job_runner.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/release/ray_release/command_runner/anyscale_job_runner.py b/release/ray_release/command_runner/anyscale_job_runner.py index 223ca7f093ea..893aa9df5081 100644 --- a/release/ray_release/command_runner/anyscale_job_runner.py +++ b/release/ray_release/command_runner/anyscale_job_runner.py @@ -349,9 +349,7 @@ def fetch_output(self) -> Dict[str, Any]: ) def cleanup(self): - try: - self.file_manager.delete(self.path_in_bucket, recursive=True) - except Exception: - # No big deal if we don't clean up, the bucket - # is set to automatically expire objects anyway - pass + # We piggy back on s3 retention policy for clean up instead of doing this + # ourselves. We find many cases where users want the data to be available + # for a short-while for debugging purpose. + pass From 0bb2600cbad5b5e4c7754a32f16f82133c83d943 Mon Sep 17 00:00:00 2001 From: Alan Guo Date: Wed, 29 May 2024 15:09:04 -0700 Subject: [PATCH 47/65] Fail gracefully when cluster status is not available yet. 
(#45620) --- .../src/components/AutoscalerStatusCards.tsx | 53 +++++++++++-------- dashboard/client/src/pages/job/JobDetail.tsx | 6 +-- .../src/pages/job/hook/useClusterStatus.ts | 4 +- .../src/pages/overview/OverviewPage.tsx | 6 +-- 4 files changed, 40 insertions(+), 29 deletions(-) diff --git a/dashboard/client/src/components/AutoscalerStatusCards.tsx b/dashboard/client/src/components/AutoscalerStatusCards.tsx index 80eeb9b73200..544e44467a79 100644 --- a/dashboard/client/src/components/AutoscalerStatusCards.tsx +++ b/dashboard/client/src/components/AutoscalerStatusCards.tsx @@ -2,38 +2,53 @@ import { Box, Typography } from "@mui/material"; import React from "react"; import { RayStatusResp } from "../service/status"; -const formatNodeStatus = (cluster_status: string) => { +const formatNodeStatus = (clusterStatus?: string) => { // ==== auto scaling status // Node status // .... // Resources // .... - const sections = cluster_status.split("Resources"); - return formatClusterStatus( - "Node Status", - sections[0].split("Node status")[1], - ); + if (!clusterStatus) { + return "No cluster status."; + } + try { + // Try to parse the node status. + const sections = clusterStatus.split("Resources"); + return formatClusterStatus( + "Node Status", + sections[0].split("Node status")[1], + ); + } catch (e) { + return "No cluster status."; + } }; -const formatResourcesStatus = (cluster_status: string) => { +const formatResourcesStatus = (clusterStatus?: string) => { // ==== auto scaling status // Node status // .... // Resources // .... 
- const sections = cluster_status.split("Resources"); - return formatClusterStatus("Resource Status", sections[1]); + if (!clusterStatus) { + return "No cluster status."; + } + try { + const sections = clusterStatus.split("Resources"); + return formatClusterStatus("Resource Status", sections[1]); + } catch (e) { + return "No cluster status."; + } }; -const formatClusterStatus = (title: string, cluster_status: string) => { - const cluster_status_rows = cluster_status.split("\n"); +const formatClusterStatus = (title: string, clusterStatus: string) => { + const clusterStatusRows = clusterStatus.split("\n"); return (
{title} - {cluster_status_rows.map((i, key) => { + {clusterStatusRows.map((i, key) => { // Format the output. // See format_info_string in util.py if (i.startsWith("-----") || i.startsWith("=====") || i === "") { @@ -54,10 +69,10 @@ const formatClusterStatus = (title: string, cluster_status: string) => { }; type StatusCardProps = { - cluster_status: RayStatusResp | undefined; + clusterStatus: RayStatusResp | undefined; }; -export const NodeStatusCard = ({ cluster_status }: StatusCardProps) => { +export const NodeStatusCard = ({ clusterStatus }: StatusCardProps) => { return ( { overflowY: "scroll", }} > - {cluster_status?.data && cluster_status?.data?.clusterStatus - ? formatNodeStatus(cluster_status?.data.clusterStatus) - : "No cluster status."} + {formatNodeStatus(clusterStatus?.data.clusterStatus)} ); }; -export const ResourceStatusCard = ({ cluster_status }: StatusCardProps) => { +export const ResourceStatusCard = ({ clusterStatus }: StatusCardProps) => { return ( { overflowY: "scroll", }} > - {cluster_status?.data && cluster_status?.data?.clusterStatus - ? formatResourcesStatus(cluster_status?.data.clusterStatus) - : "No cluster status."} + {formatResourcesStatus(clusterStatus?.data.clusterStatus)} ); }; diff --git a/dashboard/client/src/pages/job/JobDetail.tsx b/dashboard/client/src/pages/job/JobDetail.tsx index 1d7231e26544..8853bdf69071 100644 --- a/dashboard/client/src/pages/job/JobDetail.tsx +++ b/dashboard/client/src/pages/job/JobDetail.tsx @@ -55,7 +55,7 @@ export const JobDetailChartsPage = () => { const [actorListFilter, setActorListFilter] = useState(); const [actorTableExpanded, setActorTableExpanded] = useState(false); const actorTableRef = useRef(null); - const { cluster_status } = useRayStatus(); + const { clusterStatus } = useRayStatus(); const { data } = useSWR( job?.job_id ? ["useDataDatasets", job.job_id] : null, @@ -182,10 +182,10 @@ export const JobDetailChartsPage = () => { >
- +
- +
diff --git a/dashboard/client/src/pages/job/hook/useClusterStatus.ts b/dashboard/client/src/pages/job/hook/useClusterStatus.ts index 61ce605f14b0..ec34f1ca072b 100644 --- a/dashboard/client/src/pages/job/hook/useClusterStatus.ts +++ b/dashboard/client/src/pages/job/hook/useClusterStatus.ts @@ -3,7 +3,7 @@ import { API_REFRESH_INTERVAL_MS } from "../../../common/constants"; import { getRayStatus } from "../../../service/status"; export const useRayStatus = () => { - const { data: cluster_status } = useSWR( + const { data: clusterStatus } = useSWR( "useClusterStatus", async () => { try { @@ -19,6 +19,6 @@ export const useRayStatus = () => { ); return { - cluster_status, + clusterStatus, }; }; diff --git a/dashboard/client/src/pages/overview/OverviewPage.tsx b/dashboard/client/src/pages/overview/OverviewPage.tsx index 3065be699ec0..4a5b4455e367 100644 --- a/dashboard/client/src/pages/overview/OverviewPage.tsx +++ b/dashboard/client/src/pages/overview/OverviewPage.tsx @@ -52,7 +52,7 @@ const useStyles = makeStyles((theme) => export const OverviewPage = () => { const classes = useStyles(); - const { cluster_status } = useRayStatus(); + const { clusterStatus } = useRayStatus(); return (
@@ -80,7 +80,7 @@ export const OverviewPage = () => { classes.autoscalerCard, )} > - + { classes.autoscalerCard, )} > - +
} From dd7cbee11767a936a24617fea727f205b33680fb Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Wed, 29 May 2024 17:29:15 -0700 Subject: [PATCH 48/65] [ci] remove 3.8 from ray supported python list (#45622) Notice that we haven't removed this support completely once I work on upgrading python 3.12. Need to change some runtime environment to `oss-ci-base_build` since `forge` is using python 3.8. Test: - CI Signed-off-by: can --- .buildkite/lint.rayci.yml | 3 ++- python/setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/lint.rayci.yml b/.buildkite/lint.rayci.yml index 1bb5266883d9..338584e4737f 100644 --- a/.buildkite/lint.rayci.yml +++ b/.buildkite/lint.rayci.yml @@ -24,9 +24,10 @@ steps: key: lint-medium instance_type: medium depends_on: - - forge + - oss-ci-base_build commands: - ./ci/lint/lint.sh {{matrix}} + job_env: oss-ci-base_build matrix: - api_annotations - api_discrepancy diff --git a/python/setup.py b/python/setup.py index 8c1d573f5109..22d2edaaa909 100644 --- a/python/setup.py +++ b/python/setup.py @@ -27,7 +27,7 @@ logger = logging.getLogger(__name__) -SUPPORTED_PYTHONS = [(3, 8), (3, 9), (3, 10), (3, 11)] +SUPPORTED_PYTHONS = [(3, 9), (3, 10), (3, 11)] # When the bazel version is updated, make sure to update it # in WORKSPACE file as well. From 5faf4761e9505aae7a83319630877eb99bba467d Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 29 May 2024 20:04:46 -0700 Subject: [PATCH 49/65] [data] Refactor resource manager (#45623) Refactor ResourceManager and avoid it directly depending on concrete operators. 
--------- Signed-off-by: Hao Chen --- .../execution/interfaces/physical_operator.py | 15 ++++ .../execution/operators/input_data_buffer.py | 3 + .../execution/operators/limit_operator.py | 3 + .../execution/operators/map_operator.py | 3 + .../execution/operators/output_splitter.py | 3 + .../_internal/execution/resource_manager.py | 87 +++++++++---------- 6 files changed, 66 insertions(+), 48 deletions(-) diff --git a/python/ray/data/_internal/execution/interfaces/physical_operator.py b/python/ray/data/_internal/execution/interfaces/physical_operator.py index 739262ef2b80..06a61e7b0b3d 100644 --- a/python/ray/data/_internal/execution/interfaces/physical_operator.py +++ b/python/ray/data/_internal/execution/interfaces/physical_operator.py @@ -426,3 +426,18 @@ def notify_in_task_submission_backpressure(self, in_backpressure: bool) -> None: def get_autoscaling_actor_pools(self) -> List[AutoscalingActorPool]: """Return a list of `AutoscalingActorPool`s managed by this operator.""" return [] + + def implements_accurate_memory_accounting(self) -> bool: + """Return whether this operator implements accurate memory accounting. + + An operator that implements accurate memory accounting should should properly + report its memory usage via the following APIs: + - `self._metrics.on_input_queued`. + - `self._metrics.on_input_dequeued`. + - `self._metrics.on_output_queued`. + - `self._metrics.on_output_dequeued`. + """ + # TODO(hchen): Currently we only enable `ReservationOpResourceAllocator` when + # all operators in the dataset have implemented accurate memory accounting. + # Eventually all operators should implement accurate memory accounting. 
+ return False diff --git a/python/ray/data/_internal/execution/operators/input_data_buffer.py b/python/ray/data/_internal/execution/operators/input_data_buffer.py index cc0447d8d26e..1236093d4130 100644 --- a/python/ray/data/_internal/execution/operators/input_data_buffer.py +++ b/python/ray/data/_internal/execution/operators/input_data_buffer.py @@ -84,3 +84,6 @@ def _initialize_metadata(self): self._stats = { "input": block_metadata, } + + def implements_accurate_memory_accounting(self) -> bool: + return True diff --git a/python/ray/data/_internal/execution/operators/limit_operator.py b/python/ray/data/_internal/execution/operators/limit_operator.py index d021b3db112c..7db0590960da 100644 --- a/python/ray/data/_internal/execution/operators/limit_operator.py +++ b/python/ray/data/_internal/execution/operators/limit_operator.py @@ -120,3 +120,6 @@ def num_outputs_total(self) -> int: def throttling_disabled(self) -> bool: return True + + def implements_accurate_memory_accounting(self) -> bool: + return True diff --git a/python/ray/data/_internal/execution/operators/map_operator.py b/python/ray/data/_internal/execution/operators/map_operator.py index 0c8023ef5397..eb96b3622146 100644 --- a/python/ray/data/_internal/execution/operators/map_operator.py +++ b/python/ray/data/_internal/execution/operators/map_operator.py @@ -402,6 +402,9 @@ def base_resource_usage(self) -> ExecutionResources: def incremental_resource_usage(self) -> ExecutionResources: raise NotImplementedError + def implements_accurate_memory_accounting(self) -> bool: + return True + def _map_task( map_transformer: MapTransformer, diff --git a/python/ray/data/_internal/execution/operators/output_splitter.py b/python/ray/data/_internal/execution/operators/output_splitter.py index b99a9e0b6847..af7a3429d04e 100644 --- a/python/ray/data/_internal/execution/operators/output_splitter.py +++ b/python/ray/data/_internal/execution/operators/output_splitter.py @@ -241,6 +241,9 @@ def _get_location(self, bundle: 
RefBundle) -> Optional[NodeIdStr]: """ return bundle.get_cached_location() + def implements_accurate_memory_accounting(self) -> bool: + return True + def _split(bundle: RefBundle, left_size: int) -> Tuple[RefBundle, RefBundle]: left_blocks, left_meta = [], [] diff --git a/python/ray/data/_internal/execution/resource_manager.py b/python/ray/data/_internal/execution/resource_manager.py index 26a6cf3681ac..6b63f0bf0968 100644 --- a/python/ray/data/_internal/execution/resource_manager.py +++ b/python/ray/data/_internal/execution/resource_manager.py @@ -12,9 +12,6 @@ ) from ray.data._internal.execution.interfaces.physical_operator import PhysicalOperator from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer -from ray.data._internal.execution.operators.limit_operator import LimitOperator -from ray.data._internal.execution.operators.map_operator import MapOperator -from ray.data._internal.execution.operators.output_splitter import OutputSplitter from ray.data._internal.execution.util import memory_string from ray.data.context import DataContext @@ -42,15 +39,6 @@ class ResourceManager: # when `ReservationOpResourceAllocator` is not enabled. DEFAULT_OBJECT_STORE_MEMORY_LIMIT_FRACTION_WO_RESOURCE_RESERVATION = 0.25 - # Memory accounting is accurate only for these operators. - # We'll enable memory reservation if a dataset only contains these operators. 
- _ACCURRATE_MEMORY_ACCOUNTING_OPS = ( - InputDataBuffer, - MapOperator, - LimitOperator, - OutputSplitter, - ) - def __init__(self, topology: "Topology", options: ExecutionOptions): self._topology = topology self._options = options @@ -75,11 +63,11 @@ def __init__(self, topology: "Topology", options: ExecutionOptions): ctx = DataContext.get_current() if ctx.op_resource_reservation_enabled: - should_enable = True - for op in topology: - if not isinstance(op, ResourceManager._ACCURRATE_MEMORY_ACCOUNTING_OPS): - should_enable = False - break + # We'll enable memory reservation if all operators have + # implemented accurate memory accounting. + should_enable = all( + op.implements_accurate_memory_accounting() for op in topology + ) if should_enable: self._op_resource_allocator = ReservationOpResourceAllocator( self, ctx.op_resource_reservation_ratio @@ -213,7 +201,7 @@ def get_downstream_fraction(self, op: PhysicalOperator) -> float: """Return the downstream fraction of the given operator.""" return self._downstream_fraction[op] - def get_downstream_object_store_memory(self, op: PhysicalOperator) -> int: + def get_downstream_object_store_memory(self, op: PhysicalOperator) -> float: """Return the downstream object store memory usage of the given operator.""" return self._downstream_object_store_memory[op] @@ -240,7 +228,7 @@ def __init__(self, resource_manager: ResourceManager): self._resource_manager = resource_manager @abstractmethod - def update_usages(self) -> ExecutionResources: + def update_usages(self): """Callback to update resource usages.""" ... @@ -259,20 +247,22 @@ def max_task_output_bytes_to_read(self, op: PhysicalOperator) -> Optional[int]: class ReservationOpResourceAllocator(OpResourceAllocator): """An OpResourceAllocator implementation that reserves resources for each operator. - This class reserves memory and CPU resources for map operators, and consider runtime - resource usages to limit the resources that each operator can use. 
+ This class reserves memory and CPU resources for eligible operators, and considers + runtime resource usages to limit the resources that each operator can use. It works in the following way: - 1. Currently we only limit map operators. Non-map operators are not throttled, but - their usage will be accounted for their upstream map operators. E.g., for such - a dataset "map1->limit->map2->streaming_split", we'll treat "map1->limit" as + 1. An operator is eligible for resource reservation, if it has enabled throttling + and hasn't completed. Ineligible operators are not throttled, but + their usage will be accounted for their upstream eligible operators. E.g., for + such a dataset "map1->limit->map2->streaming_split", we'll treat "map1->limit" as a group and "map2->streaming_split" as another group. - 2. For each map operator, we reserve `reservation_ratio * global_resources / - num_map_ops` resources, half of which is reserved only for the operator outputs, - excluding pending task outputs. + 2. For each eligible operator, we reserve `reservation_ratio * global_resources / + num_eligible_ops` resources, half of which is reserved only for the operator + outputs, excluding pending task outputs. 3. Non-reserved resources are shared among all operators. - 4. In each scheduling iteration, each map operator will get "remaining of their own - reserved resources" + "remaining of shared resources / num_map_ops" resources. + 4. In each scheduling iteration, each eligible operator will get "remaining of their + own reserved resources" + "remaining of shared resources / num_eligible_ops" + resources. The `reservation_ratio` is set to 50% by default. Users can tune this value to adjust how aggressive or conservative the resource allocation is. A higher value @@ -357,7 +347,7 @@ def __init__(self, resource_manager: ResourceManager, reservation_ratio: float): # the pending task outputs, and/or op's internal output buffers (the latter can # happen when `preserve_order=True`). 
# Then we'll have no budget to pull blocks from the op. - self._reserved_for_op_outputs: Dict[PhysicalOperator, int] = {} + self._reserved_for_op_outputs: Dict[PhysicalOperator, float] = {} # Total shared resources. self._total_shared = ExecutionResources.zero() # Resource budgets for each operator, excluding `_reserved_for_op_outputs`. @@ -374,12 +364,13 @@ def __init__(self, resource_manager: ResourceManager, reservation_ratio: float): self._idle_detector = self.IdleDetector() + def _is_op_eligible(self, op: PhysicalOperator) -> bool: + """Whether the op is eligible for memory reservation.""" + return not op.throttling_disabled() and not op.completed() + def _get_eligible_ops(self) -> List[PhysicalOperator]: - # Only consider map operators that are not completed. return [ - op - for op in self._resource_manager._topology - if isinstance(op, MapOperator) and not op.completed() + op for op in self._resource_manager._topology if self._is_op_eligible(op) ] def _update_reservation(self): @@ -472,7 +463,7 @@ def _should_unblock_streaming_output_backpressure( # launch tasks. Then we should temporarily unblock the streaming output # backpressure by allowing reading at least 1 block. So the current operator # can finish at least one task and yield resources to the downstream operators. - for next_op in self._get_downstream_map_ops(op): + for next_op in self._get_downstream_eligible_ops(op): if not self._reserved_min_resources[next_op]: # Case 1: the downstream operator hasn't reserved the minimum resources # to run at least one task. @@ -485,16 +476,16 @@ def _should_unblock_streaming_output_backpressure( return True return False - def _get_op_outputs_usage_with_downstream(self, op: PhysicalOperator) -> int: + def _get_op_outputs_usage_with_downstream(self, op: PhysicalOperator) -> float: """Get the outputs memory usage of the given operator, including the downstream - non-Map operators. + ineligible operators. """ # Outputs usage of the current operator. 
op_outputs_usage = self._resource_manager._mem_op_outputs[op] - # Also account the downstream non-Map operators' memory usage. + # Also account the downstream ineligible operators' memory usage. op_outputs_usage += sum( self._resource_manager.get_op_usage(next_op).object_store_memory - for next_op in self._get_downstream_non_map_ops(op) + for next_op in self._get_downstream_ineligible_ops(op) ) return op_outputs_usage @@ -511,35 +502,35 @@ def max_task_output_bytes_to_read(self, op: PhysicalOperator) -> Optional[int]: res = 1 return res - def _get_downstream_non_map_ops( + def _get_downstream_ineligible_ops( self, op: PhysicalOperator ) -> Iterable[PhysicalOperator]: - """Get the downstream non-Map operators of the given operator. + """Get the downstream ineligible operators of the given operator. E.g., - "cur_map->downstream_map" will return an empty list. - "cur_map->limit1->limit2->downstream_map" will return [limit1, limit2]. """ for next_op in op.output_dependencies: - if not isinstance(next_op, MapOperator): + if not self._is_op_eligible(next_op): yield next_op - yield from self._get_downstream_non_map_ops(next_op) + yield from self._get_downstream_ineligible_ops(next_op) - def _get_downstream_map_ops( + def _get_downstream_eligible_ops( self, op: PhysicalOperator ) -> Iterable[PhysicalOperator]: - """Get the downstream Map operators of the given operator, ignoring intermediate - non-Map operators. + """Get the downstream eligible operators of the given operator, ignoring intermediate + ineligible operators. E.g., - "cur_map->downstream_map" will return [downstream_map]. - "cur_map->limit1->limit2->downstream_map" will return [downstream_map]. 
""" for next_op in op.output_dependencies: - if isinstance(next_op, MapOperator): + if self._is_op_eligible(next_op): yield next_op else: - yield from self._get_downstream_map_ops(next_op) + yield from self._get_downstream_eligible_ops(next_op) def update_usages(self): self._update_reservation() From f2dfc37a374b8838019869dbd63928d9f73e52dd Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Wed, 29 May 2024 20:51:50 -0700 Subject: [PATCH 50/65] [Core] Fix race condition in setting node death info (#45619) Signed-off-by: Rui Qiao --- src/ray/gcs/gcs_server/gcs_node_manager.cc | 51 ++++++++----------- src/ray/gcs/gcs_server/gcs_node_manager.h | 15 ++---- .../test/gcs_actor_scheduler_test.cc | 12 +++-- .../test/gcs_autoscaler_state_manager_test.cc | 32 ++++++++++++ .../gcs_server/test/gcs_node_manager_test.cc | 6 ++- .../gcs_placement_group_scheduler_test.cc | 3 +- .../gcs_server/test/gcs_server_test_util.h | 16 +++++- src/ray/raylet/main.cc | 2 +- 8 files changed, 87 insertions(+), 50 deletions(-) diff --git a/src/ray/gcs/gcs_server/gcs_node_manager.cc b/src/ray/gcs/gcs_server/gcs_node_manager.cc index 1aa65a539ee3..6c04c4e1aede 100644 --- a/src/ray/gcs/gcs_server/gcs_node_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_node_manager.cc @@ -113,8 +113,7 @@ void GcsNodeManager::HandleUnregisterNode(rpc::UnregisterNodeRequest request, rpc::SendReplyCallback send_reply_callback) { NodeID node_id = NodeID::FromBinary(request.node_id()); RAY_LOG(DEBUG) << "HandleUnregisterNode() for node id = " << node_id; - SetDeathInfo(node_id, request.node_death_info()); - auto node = RemoveNode(node_id, /* is_intended = */ true); + auto node = RemoveNode(node_id, request.node_death_info()); if (!node) { RAY_LOG(INFO) << "Node " << node_id << " is already removed"; return; @@ -224,26 +223,9 @@ absl::optional> GcsNodeManager::GetAliveNode( return iter->second; } -void GcsNodeManager::SetDeathInfo(const NodeID &node_id, - const rpc::NodeDeathInfo 
&death_info) { - auto maybe_node = GetAliveNode(node_id); - if (!maybe_node.has_value()) { - return; - } - - auto node = std::move(maybe_node.value()); - auto node_death_info = node->mutable_death_info(); - node_death_info->CopyFrom(death_info); -} - rpc::NodeDeathInfo GcsNodeManager::InferDeathInfo(const NodeID &node_id) { - auto maybe_node = GetAliveNode(node_id); - RAY_CHECK(maybe_node.has_value()) - << "InferDeathInfo() should be called before node is removed"; - auto node = maybe_node.value(); auto iter = draining_nodes_.find(node_id); rpc::NodeDeathInfo death_info; - bool expect_force_termination; if (iter == draining_nodes_.end()) { expect_force_termination = false; @@ -288,7 +270,11 @@ void GcsNodeManager::SetNodeDraining( const NodeID &node_id, std::shared_ptr drain_request) { auto maybe_node = GetAliveNode(node_id); - RAY_CHECK(maybe_node.has_value()); + if (!maybe_node.has_value()) { + RAY_LOG(INFO) << "Skip setting node " << node_id << " to be draining, " + << "which is already removed"; + return; + } auto iter = draining_nodes_.find(node_id); if (iter == draining_nodes_.end()) { draining_nodes_.emplace(node_id, drain_request); @@ -303,16 +289,20 @@ void GcsNodeManager::SetNodeDraining( } std::shared_ptr GcsNodeManager::RemoveNode( - const ray::NodeID &node_id, bool is_intended /*= false*/) { + const ray::NodeID &node_id, const rpc::NodeDeathInfo &node_death_info) { std::shared_ptr removed_node; auto iter = alive_nodes_.find(node_id); if (iter != alive_nodes_.end()) { removed_node = std::move(iter->second); - auto death_info = removed_node->death_info(); + + // Set node death info. 
+ auto death_info = removed_node->mutable_death_info(); + death_info->CopyFrom(node_death_info); + RAY_LOG(INFO) << "Removing node, node id = " << node_id << ", node name = " << removed_node->node_name() << ", death reason = " - << rpc::NodeDeathInfo_Reason_Name(death_info.reason()) - << ", death message = " << death_info.reason_message(); + << rpc::NodeDeathInfo_Reason_Name(death_info->reason()) + << ", death message = " << death_info->reason_message(); // Record stats that there's a new removed node. stats::NodeFailureTotal.Record(1); // Remove from alive nodes. @@ -320,7 +310,7 @@ std::shared_ptr GcsNodeManager::RemoveNode( node_map_.left.erase(node_id); // Remove from draining nodes if present. draining_nodes_.erase(node_id); - if (!is_intended) { + if (death_info->reason() == rpc::NodeDeathInfo::UNEXPECTED_TERMINATION) { // Broadcast a warning to all of the drivers indicating that the node // has been marked as dead. // TODO(rkn): Define this constant somewhere else. @@ -332,7 +322,7 @@ std::shared_ptr GcsNodeManager::RemoveNode( << " and node name: " << removed_node->node_name() << " has been marked dead because the detector" << " has missed too many heartbeats from it. This can happen when a " - "\t(1) raylet crashes unexpectedly (OOM, preempted node, etc.) \n" + "\t(1) raylet crashes unexpectedly (OOM, etc.) 
\n" << "\t(2) raylet has lagging heartbeats due to slow network or busy workload."; RAY_EVENT(ERROR, EL_RAY_NODE_REMOVED) .WithField("node_id", node_id.Hex()) @@ -354,11 +344,10 @@ std::shared_ptr GcsNodeManager::RemoveNode( void GcsNodeManager::OnNodeFailure(const NodeID &node_id, const StatusCallback &node_table_updated_callback) { - rpc::NodeDeathInfo death_info = InferDeathInfo(node_id); - SetDeathInfo(node_id, death_info); - bool is_expected_termination = - (death_info.reason() == rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED); - if (auto node = RemoveNode(node_id, is_expected_termination)) { + auto maybe_node = GetAliveNode(node_id); + if (maybe_node.has_value()) { + rpc::NodeDeathInfo death_info = InferDeathInfo(node_id); + auto node = RemoveNode(node_id, death_info); node->set_state(rpc::GcsNodeInfo::DEAD); node->set_end_time_ms(current_sys_time_ms()); diff --git a/src/ray/gcs/gcs_server/gcs_node_manager.h b/src/ray/gcs/gcs_server/gcs_node_manager.h index 92035f6b9c46..edefa416cda9 100644 --- a/src/ray/gcs/gcs_server/gcs_node_manager.h +++ b/src/ray/gcs/gcs_server/gcs_node_manager.h @@ -109,13 +109,14 @@ class GcsNodeManager : public rpc::NodeInfoHandler { void SetNodeDraining(const NodeID &node_id, std::shared_ptr request); - /// Remove from alive nodes. + /// Remove a node from alive nodes. The node's death information will also be set. /// /// \param node_id The ID of the node to be removed. - /// \param is_intended False if this is triggered by `node_failure_detector_`, else - /// True. + /// \param node_death_info The node death info to set. + /// \return The removed node, with death info set. If the node is not found, return + /// nullptr. std::shared_ptr RemoveNode(const NodeID &node_id, - bool is_intended = false); + const rpc::NodeDeathInfo &node_death_info); /// Get alive node by ID. /// @@ -176,12 +177,6 @@ class GcsNodeManager : public rpc::NodeInfoHandler { /// \param node The node which is dead. 
void AddDeadNodeToCache(std::shared_ptr node); - /// Set the death info of the node. - /// - /// \param node_id The ID of the node. - /// \param death_info The death info of the node. - void SetDeathInfo(const NodeID &node_id, const rpc::NodeDeathInfo &death_info); - /// Infer death cause of the node based on existing draining requests. /// /// \param node_id The ID of the node. The node must not be removed diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc index 3b3e432c130b..557a23c7fddc 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc @@ -334,7 +334,8 @@ TEST_F(GcsActorSchedulerTest, TestNodeFailedWhenLeasing) { // Remove the node and cancel the scheduling on this node, the scheduling should be // interrupted. - gcs_node_manager_->RemoveNode(node_id); + rpc::NodeDeathInfo death_info; + gcs_node_manager_->RemoveNode(node_id, death_info); ASSERT_EQ(0, gcs_node_manager_->GetAllAliveNodes().size()); auto actor_ids = gcs_actor_scheduler_->CancelOnNode(node_id); ASSERT_EQ(1, actor_ids.size()); @@ -422,7 +423,8 @@ TEST_F(GcsActorSchedulerTest, TestNodeFailedWhenCreating) { // Remove the node and cancel the scheduling on this node, the scheduling should be // interrupted. - gcs_node_manager_->RemoveNode(node_id); + rpc::NodeDeathInfo death_info; + gcs_node_manager_->RemoveNode(node_id, death_info); ASSERT_EQ(0, gcs_node_manager_->GetAllAliveNodes().size()); auto actor_ids = gcs_actor_scheduler_->CancelOnNode(node_id); ASSERT_EQ(1, actor_ids.size()); @@ -934,7 +936,8 @@ TEST_F(GcsActorSchedulerTest, TestNodeFailedWhenLeasingByGcs) { // Remove the node and cancel the scheduling on this node, the scheduling should be // interrupted. 
- gcs_node_manager_->RemoveNode(node_id); + rpc::NodeDeathInfo death_info; + gcs_node_manager_->RemoveNode(node_id, death_info); ASSERT_EQ(0, gcs_node_manager_->GetAllAliveNodes().size()); auto actor_ids = gcs_actor_scheduler_->CancelOnNode(node_id); ASSERT_EQ(1, actor_ids.size()); @@ -1028,7 +1031,8 @@ TEST_F(GcsActorSchedulerTest, TestNodeFailedWhenCreatingByGcs) { // Remove the node and cancel the scheduling on this node, the scheduling should be // interrupted. - gcs_node_manager_->RemoveNode(node_id); + rpc::NodeDeathInfo death_info; + gcs_node_manager_->RemoveNode(node_id, death_info); ASSERT_EQ(0, gcs_node_manager_->GetAllAliveNodes().size()); auto actor_ids = gcs_actor_scheduler_->CancelOnNode(node_id); ASSERT_EQ(1, actor_ids.size()); diff --git a/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc index 1f3c5aca5a21..913cce5a300c 100644 --- a/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc @@ -752,6 +752,38 @@ TEST_F(GcsAutoscalerStateManagerTest, TestDrainingStatus) { } } +TEST_F(GcsAutoscalerStateManagerTest, TestDrainNodeRaceCondition) { + auto node = Mocker::GenNodeInfo(); + + // Adding a node. 
+ node->mutable_resources_total()->insert({"CPU", 2}); + node->mutable_resources_total()->insert({"GPU", 1}); + node->set_instance_id("instance_1"); + AddNode(node); + + rpc::autoscaler::DrainNodeRequest request; + request.set_node_id(node->node_id()); + request.set_reason(rpc::autoscaler::DrainNodeReason::DRAIN_NODE_REASON_PREEMPTION); + request.set_reason_message("preemption"); + request.set_deadline_timestamp_ms(std::numeric_limits::max()); + rpc::autoscaler::DrainNodeReply reply; + auto send_reply_callback = + [](ray::Status status, std::function f1, std::function f2) {}; + gcs_autoscaler_state_manager_->HandleDrainNode(request, &reply, send_reply_callback); + + // At this point, the GCS request is not accepted yet since raylet has not replied. + ASSERT_FALSE(reply.is_accepted()); + + // Inject a race condition on GCS: remove the node before raylet accepts the request. + RemoveNode(node); + + // Simulate the raylet accepting the drain request and replying to GCS. + ASSERT_TRUE(raylet_client_->ReplyDrainRaylet()); + + // The GCS request is accepted now. 
+ ASSERT_TRUE(reply.is_accepted()); +} + TEST_F(GcsAutoscalerStateManagerTest, TestIdleTime) { auto node = Mocker::GenNodeInfo(); diff --git a/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc index 0424b209faa2..a8a0157e0d54 100644 --- a/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc @@ -51,7 +51,8 @@ TEST_F(GcsNodeManagerTest, TestManagement) { node_manager.AddNode(node); ASSERT_EQ(node, node_manager.GetAliveNode(node_id).value()); - node_manager.RemoveNode(node_id); + rpc::NodeDeathInfo death_info; + node_manager.RemoveNode(node_id, death_info); ASSERT_TRUE(!node_manager.GetAliveNode(node_id).has_value()); } @@ -84,8 +85,9 @@ TEST_F(GcsNodeManagerTest, TestListener) { [&removed_nodes](std::shared_ptr node) { removed_nodes.emplace_back(std::move(node)); }); + rpc::NodeDeathInfo death_info; for (int i = 0; i < node_count; ++i) { - node_manager.RemoveNode(NodeID::FromBinary(added_nodes[i]->node_id())); + node_manager.RemoveNode(NodeID::FromBinary(added_nodes[i]->node_id()), death_info); } ASSERT_EQ(node_count, removed_nodes.size()); ASSERT_TRUE(node_manager.GetAllAliveNodes().empty()); diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc index e6a53623d750..f94d3ac2a8b2 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc @@ -125,7 +125,8 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { } void RemoveNode(const std::shared_ptr &node) { - gcs_node_manager_->RemoveNode(NodeID::FromBinary(node->node_id())); + rpc::NodeDeathInfo death_info; + gcs_node_manager_->RemoveNode(NodeID::FromBinary(node->node_id()), death_info); gcs_resource_manager_->OnNodeDead(NodeID::FromBinary(node->node_id())); } diff --git 
a/src/ray/gcs/gcs_server/test/gcs_server_test_util.h b/src/ray/gcs/gcs_server/test/gcs_server_test_util.h index 432ed655bf7f..fcd6b6bf6a3a 100644 --- a/src/ray/gcs/gcs_server/test/gcs_server_test_util.h +++ b/src/ray/gcs/gcs_server/test/gcs_server_test_util.h @@ -200,6 +200,19 @@ struct GcsServerMocker { } } + bool ReplyDrainRaylet() { + if (drain_raylet_callbacks.size() == 0) { + return false; + } else { + rpc::DrainRayletReply reply; + reply.set_is_accepted(true); + auto callback = drain_raylet_callbacks.front(); + callback(Status::OK(), reply); + drain_raylet_callbacks.pop_front(); + return true; + } + } + /// ResourceReserveInterface void PrepareBundleResources( const std::vector> &bundle_specs, @@ -304,7 +317,7 @@ struct GcsServerMocker { const rpc::ClientCallback &callback) override { rpc::DrainRayletReply reply; reply.set_is_accepted(true); - callback(Status::OK(), reply); + drain_raylet_callbacks.push_back(callback); }; void NotifyGCSRestart( @@ -319,6 +332,7 @@ struct GcsServerMocker { int num_release_unused_workers = 0; int num_get_task_failure_causes = 0; NodeID node_id = NodeID::FromRandom(); + std::list> drain_raylet_callbacks = {}; std::list> callbacks = {}; std::list> cancel_callbacks = {}; std::list> release_callbacks = {}; diff --git a/src/ray/raylet/main.cc b/src/ray/raylet/main.cc index 2942bd50ea4a..19d90124892a 100644 --- a/src/ray/raylet/main.cc +++ b/src/ray/raylet/main.cc @@ -450,7 +450,7 @@ int main(int argc, char *argv[]) { drain_request->reason() == ray::rpc::autoscaler::DrainNodeReason::DRAIN_NODE_REASON_PREEMPTION && drain_request->deadline_timestamp_ms() != 0 && - drain_request->deadline_timestamp_ms() < current_time_ms()) { + drain_request->deadline_timestamp_ms() < current_sys_time_ms()) { node_death_info.set_reason(ray::rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED); node_death_info.set_reason_message(drain_request->reason_message()); } else { From 480b572d48d971cb0a379dd6b3bb3b0d7f21e5bc Mon Sep 17 00:00:00 2001 From: Alan Guo 
Date: Wed, 29 May 2024 22:03:57 -0700 Subject: [PATCH 51/65] add env var to allow dashboard http server to serve static files with symlinks (#45618) New env var is called RAY_DASHBOARD_BUILD_FOLLOW_SYMLINKS. This is an advanced setting that should only be used with special Ray installations where the dashboard build files are symlinked to a different directory. This is not recommended for most users and can pose a security risk. Please reference the aiohttp docs here: https://docs.aiohttp.org/en/stable/web_reference.html#aiohttp.web.UrlDispatcher.add_static --- dashboard/http_server_head.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/dashboard/http_server_head.py b/dashboard/http_server_head.py index 50d3e05473da..e1c427b1b288 100644 --- a/dashboard/http_server_head.py +++ b/dashboard/http_server_head.py @@ -28,6 +28,22 @@ logger = logging.getLogger(__name__) routes = dashboard_optional_utils.DashboardHeadRouteTable +# Env var that enables follow_symlinks for serving UI static files. +# This is an advanced setting that should only be used with special Ray installations +# where the dashboard build files are symlinked to a different directory. +# This is not recommended for most users and can pose a security risk. +# Please reference the aiohttp docs here: +# https://docs.aiohttp.org/en/stable/web_reference.html#aiohttp.web.UrlDispatcher.add_static +ENV_VAR_FOLLOW_SYMLINKS = "RAY_DASHBOARD_BUILD_FOLLOW_SYMLINKS" +FOLLOW_SYMLINKS_ENABLED = os.environ.get(ENV_VAR_FOLLOW_SYMLINKS) == "1" +if FOLLOW_SYMLINKS_ENABLED: + logger.warning( + "Enabling RAY_DASHBOARD_BUILD_FOLLOW_SYMLINKS is not recommended as it " + "allows symlinks to directories outside the dashboard build folder. " + "You may accidentally expose files on your system outside of the " + "build directory." 
+ ) + def setup_static_dir(): build_dir = os.path.join( @@ -47,7 +63,7 @@ def setup_static_dir(): ) static_dir = os.path.join(build_dir, "static") - routes.static("/static", static_dir) + routes.static("/static", static_dir, follow_symlinks=FOLLOW_SYMLINKS_ENABLED) return build_dir From c94140a3a4b8ae9c0e6f1965d6285161189f9851 Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Thu, 30 May 2024 08:50:54 +0200 Subject: [PATCH 52/65] [RLlib] Complete do-over of RLlib release tests (new API stack). (#45589) --- release/release_tests.yaml | 600 +----------------- .../multi_gpu_learning_tests.yaml | 237 ------- .../_old_multi_gpu_learning_tests/run.py | 35 - ...lti_gpu_with_attention_learning_tests.yaml | 191 ------ .../run.py | 35 - .../multi_gpu_with_lstm_learning_tests.yaml | 136 ---- .../run.py | 35 - .../_old_rllib_multi_gpu_tests.yaml | 92 --- release/rllib_tests/learning_tests/README.md | 21 - .../learning_tests/run_new_api_stack.py | 61 -- .../rllib_tests/learning_tests/tuned_examples | 1 + rllib/BUILD | 36 +- rllib/algorithms/algorithm.py | 2 +- rllib/env/multi_agent_env_runner.py | 6 - rllib/env/single_agent_env_runner.py | 2 +- rllib/tuned_examples/ppo/atari-ppo.yaml | 35 - rllib/tuned_examples/ppo/atari_ppo.py | 93 +++ .../ppo/cartpole-ppo-fake-gpus.yaml | 1 + .../ppo/cartpole-ppo-grid-search-example.yaml | 14 - .../ppo/cartpole-ppo-hyperband.yaml | 16 - rllib/tuned_examples/ppo/pong-ppo.yaml | 33 - .../tuned_examples/ppo/recomm-sys001-ppo.yaml | 49 -- rllib/utils/test_utils.py | 76 ++- 23 files changed, 167 insertions(+), 1640 deletions(-) delete mode 100644 release/rllib_tests/_old_multi_gpu_learning_tests/multi_gpu_learning_tests.yaml delete mode 100644 release/rllib_tests/_old_multi_gpu_learning_tests/run.py delete mode 100644 release/rllib_tests/_old_multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml delete mode 100644 release/rllib_tests/_old_multi_gpu_with_attention_learning_tests/run.py delete mode 100644 
release/rllib_tests/_old_multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml delete mode 100644 release/rllib_tests/_old_multi_gpu_with_lstm_learning_tests/run.py delete mode 100644 release/rllib_tests/_old_rllib_multi_gpu_tests.yaml delete mode 100644 release/rllib_tests/learning_tests/README.md delete mode 100644 release/rllib_tests/learning_tests/run_new_api_stack.py create mode 120000 release/rllib_tests/learning_tests/tuned_examples delete mode 100644 rllib/tuned_examples/ppo/atari-ppo.yaml create mode 100644 rllib/tuned_examples/ppo/atari_ppo.py delete mode 100644 rllib/tuned_examples/ppo/cartpole-ppo-grid-search-example.yaml delete mode 100644 rllib/tuned_examples/ppo/cartpole-ppo-hyperband.yaml delete mode 100644 rllib/tuned_examples/ppo/pong-ppo.yaml delete mode 100644 rllib/tuned_examples/ppo/recomm-sys001-ppo.yaml diff --git a/release/release_tests.yaml b/release/release_tests.yaml index b285fb4c11d8..a9c01afbfc93 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -2740,473 +2740,13 @@ # ---------------------------------------------------------- # -------------------------- -# APPO (hybrid API stack) +# PPO # -------------------------- -- name: rllib_learning_tests_appo_hybrid_api_stack_torch +- name: rllib_learning_tests_pong_ppo_torch group: RLlib tests working_dir: rllib_tests - # Marking as unstable since it's currently expected to fail. 
- stable: false - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 4gpus_64cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=appo/hybrid_stack --framework=torch - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 4gpus_64cpus_gce.yaml - -- name: rllib_learning_tests_appo_hybrid_api_stack_tf2 - group: RLlib tests - working_dir: rllib_tests - stable: false - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 4gpus_64cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=appo/hybrid_stack --framework=tf2 - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 4gpus_64cpus_gce.yaml - -# -------------------------- -# APPO (old API stack) -# -------------------------- -- name: rllib_learning_tests_appo_old_api_stack_torch - group: RLlib tests - working_dir: rllib_tests - - # Marking as unstable since it's currently expected to fail. 
- stable: false - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 2gpus_32cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=appo/old_stack --framework=torch - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 2gpus_32cpus_gce.yaml - -- name: rllib_learning_tests_appo_old_api_stack_tf - group: RLlib tests - working_dir: rllib_tests - stable: False - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 2gpus_32cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=appo/old_stack --framework=tf - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 2gpus_32cpus_gce.yaml - - -# -------------------------- -# BC (hybrid API stack) -# -------------------------- -- name: rllib_learning_tests_bc_hybrid_api_stack_torch - group: RLlib tests - working_dir: rllib_tests - - frequency: nightly - team: rllib - - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 1gpu_16cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=bc --framework=torch - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 1gpu_16cpus_gce.yaml - -- name: rllib_learning_tests_bc_hybrid_api_stack_tf2 - group: RLlib tests - working_dir: 
rllib_tests - stable: false - - frequency: nightly - team: rllib - - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 1gpu_16cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=bc --framework=tf2 - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 1gpu_16cpus_gce.yaml - -# -------------------------- -# CQL (old API stack) -# -------------------------- -- name: rllib_learning_tests_cql_old_api_stack_torch - group: RLlib tests - working_dir: rllib_tests - - # Marking as unstable since it's currently expected to fail. - stable: false - - frequency: nightly - team: rllib - - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 1gpu_16cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=cql --framework=torch - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 1gpu_16cpus_gce.yaml - -- name: rllib_learning_tests_cql_old_api_stack_tf - group: RLlib tests - working_dir: rllib_tests - - frequency: nightly - team: rllib - - # Marking as unstable since it's currently expected to fail. 
- stable: false - - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 1gpu_16cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=cql --framework=tf - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 1gpu_16cpus_gce.yaml - -# -------------------------- -# DQN (old API stack) -# -------------------------- -- name: rllib_learning_tests_dqn_old_api_stack_torch - group: RLlib tests - working_dir: rllib_tests - - # Marking as unstable since it's currently expected to fail. - stable: false - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 1gpu_16cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=dqn --framework=torch - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 1gpu_16cpus_gce.yaml - -- name: rllib_learning_tests_dqn_old_api_stack_tf - group: RLlib tests - working_dir: rllib_tests - stable: False - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 1gpu_32cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=dqn --framework=tf - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 1gpu_16cpus_gce.yaml - -# -------------------------- -# IMPALA (old API stack) -# 
-------------------------- -- name: rllib_learning_tests_impala_old_api_stack_torch - group: RLlib tests - working_dir: rllib_tests - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 1gpu_16cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=impala --framework=torch - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 1gpu_16cpus_gce.yaml - -- name: rllib_learning_tests_impala_old_api_stack_tf - group: RLlib tests - working_dir: rllib_tests - stable: False - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 1gpu_16cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=impala --framework=tf - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 1gpu_16cpus_gce.yaml - - -# -------------------------- -# MARWIL (old API stack) -# -------------------------- -- name: rllib_learning_tests_marwil_old_api_stack_torch - group: RLlib tests - working_dir: rllib_tests - - # Marking as unstable since it's currently expected to fail. 
- stable: false - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 1gpu_16cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=marwil --framework=torch - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 1gpu_16cpus_gce.yaml - -- name: rllib_learning_tests_marwil_old_api_stack_tf - group: RLlib tests - working_dir: rllib_tests - - # Marking as unstable since it's currently expected to fail. - stable: false - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 1gpu_16cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=marwil --framework=tf - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 1gpu_16cpus_gce.yaml - - -# -------------------------- -# PPO (new API stack) -# -------------------------- -- name: rllib_learning_tests_ppo_new_api_stack_torch - group: RLlib tests - working_dir: rllib_tests - - stable: False + stable: true frequency: nightly team: rllib @@ -3220,8 +2760,8 @@ cluster_compute: 8gpus_96cpus.yaml run: - timeout: 18000 - script: python learning_tests/run_new_api_stack.py --config-dir=ppo/new_stack --framework=torch + timeout: 1800 + script: python learning_tests/tuned_examples/ppo/atari_ppo.py --enable-new-api-stack --env ALE/Pong-v5 --num-gpus=4 --num-env-runners=95 --as-test alert: default @@ -3233,136 +2773,6 @@ cluster: cluster_compute: 8gpus_96cpus_gce.yaml - -# -------------------------- -# PPO (old API stack) -# 
-------------------------- -- name: rllib_learning_tests_ppo_old_stack_torch - group: RLlib tests - working_dir: rllib_tests - - # Marking as unstable since it's currently expected to fail. - stable: false - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 2gpus_32cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=ppo/old_stack --framework=torch - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 2gpus_32cpus_gce.yaml - -- name: rllib_learning_tests_ppo_old_stack_tf - group: RLlib tests - working_dir: rllib_tests - frequency: nightly - team: rllib - stable: False - - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 2gpus_32cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=ppo/old_stack --framework=tf - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 2gpus_32cpus_gce.yaml - -# -------------------------- -# SAC (old API stack) -# -------------------------- -- name: rllib_learning_tests_sac_tf - group: RLlib tests - working_dir: rllib_tests - stable: False - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 1gpu_16cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=sac --framework=tf - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - 
env: gce - frequency: manual - cluster: - cluster_compute: 1gpu_16cpus_gce.yaml - -- name: rllib_learning_tests_sac_torch - group: RLlib tests - working_dir: rllib_tests - - frequency: nightly - team: rllib - - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 1gpu_16cpus.yaml - - run: - timeout: 18000 - script: python learning_tests/run.py --yaml-sub-dir=sac --framework=torch - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 1gpu_16cpus_gce.yaml - - ######################## # Core Nightly Tests ######################## diff --git a/release/rllib_tests/_old_multi_gpu_learning_tests/multi_gpu_learning_tests.yaml b/release/rllib_tests/_old_multi_gpu_learning_tests/multi_gpu_learning_tests.yaml deleted file mode 100644 index 8a312996532a..000000000000 --- a/release/rllib_tests/_old_multi_gpu_learning_tests/multi_gpu_learning_tests.yaml +++ /dev/null @@ -1,237 +0,0 @@ - -a2c-cartpole-v1: - env: CartPole-v1 - run: A2C - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - lr: 0.001 - -appo-cartpole-v1-no-vtrace: - env: CartPole-v1 - run: APPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - vtrace: false - num_gpus: 2 - num_workers: 5 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Double batch size (2 GPUs). 
- train_batch_size: 1000 - -appo-cartpole-v1-vtrace: - env: CartPole-v1 - run: APPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 5 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Double batch size (2 GPUs). - train_batch_size: 1000 - -ddpg-repeat-after-me-env: - env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv - run: DDPG - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: -50.0 - timesteps_total: 8000 - stop: - time_total_s: 600 - config: - env_config: - config: - continuous: true - repeat_delay: 0 - - num_gpus: 2 - num_workers: 0 - # Double batch size (2 GPUs). - train_batch_size: 512 - -dqn-cartpole-v1: - env: CartPole-v1 - run: DQN - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 50000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 0 - # Double batch size (2 GPUs). - train_batch_size: 64 - # Mimic tuned_example for DQN CartPole. - n_step: 3 - model: - fcnet_hiddens: [64] - fcnet_activation: linear - -impala-cartpole-v1: - env: CartPole-v1 - run: IMPALA - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - # Double batch size (2 GPUs). - train_batch_size: 1000 - -pg-cartpole-v1: - env: CartPole-v1 - run: PG - # Minimum reward and total ts (in given time_total_s) to pass this test. 
- pass_criteria: - sampler_results/episode_reward_mean: 130.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - # Double batch size (2 GPUs). - train_batch_size: 400 - model: - fcnet_hiddens: [64] - fcnet_activation: linear - -ppo-cartpole-v1: - env: CartPole-v1 - run: PPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 300000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Double batch size (2 GPUs). - train_batch_size: 8000 - -sac-repeat-after-me-env: - env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv - run: SAC - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 40.0 - timesteps_total: 4500 - stop: - time_total_s: 600 - config: - env_config: - config: - repeat_delay: 0 - num_gpus: 2 - num_workers: 0 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - initial_alpha: 0.001 - # Double batch size (2 GPUs). - train_batch_size: 512 - num_steps_sampled_before_learning_starts: 0 - -sac-repeat-after-me-env-continuous: - env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv - run: SAC - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: -50.0 - timesteps_total: 4500 - stop: - time_total_s: 600 - config: - env_config: - config: - continuous: true - repeat_delay: 0 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - num_gpus: 2 - num_workers: 0 - initial_alpha: 0.001 - # Double batch size (2 GPUs). 
- train_batch_size: 512 - # start learning immediately - num_steps_sampled_before_learning_starts: 0 - - -simpleq-cartpole-v1: - env: CartPole-v1 - run: SimpleQ - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 85000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 0 - -td3-repeat-after-me-env: - env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv - run: TD3 - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: -50.0 - timesteps_total: 25000 - stop: - time_total_s: 600 - config: - env_config: - config: - continuous: true - repeat_delay: 0 - - num_gpus: 2 - num_workers: 0 - # Double batch size (2 GPUs). - train_batch_size: 200 diff --git a/release/rllib_tests/_old_multi_gpu_learning_tests/run.py b/release/rllib_tests/_old_multi_gpu_learning_tests/run.py deleted file mode 100644 index eafe75a1396c..000000000000 --- a/release/rllib_tests/_old_multi_gpu_learning_tests/run.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Multi-GPU learning tests for RLlib (torch and tf). -""" - -import json -import os -from pathlib import Path - -from ray.rllib.utils.test_utils import run_learning_tests_from_yaml - -if __name__ == "__main__": - # Get path of this very script to look for yaml files. - abs_yaml_path = Path(__file__).parent - print("abs_yaml_path={}".format(abs_yaml_path)) - - yaml_files = abs_yaml_path.rglob("*.yaml") - yaml_files = sorted( - map(lambda path: str(path.absolute()), yaml_files), reverse=True - ) - - # Run all tests in the found yaml files. 
- results = run_learning_tests_from_yaml(yaml_files) - - test_output_json = os.environ.get( - "TEST_OUTPUT_JSON", "/tmp/rllib_multi_gpu_learning_tests.json" - ) - with open(test_output_json, "wt") as f: - json.dump(results, f) - - if len(results["not_passed"]) > 0: - raise ValueError( - "Not all learning tests successfully learned the tasks.\n" - f"Results=\n{results}" - ) - else: - print("Ok.") diff --git a/release/rllib_tests/_old_multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml b/release/rllib_tests/_old_multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml deleted file mode 100644 index d29c1ed63982..000000000000 --- a/release/rllib_tests/_old_multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml +++ /dev/null @@ -1,191 +0,0 @@ - -appo-stateless-cartpole-no-vtrace: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: APPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - vtrace: false - num_gpus: 2 - num_workers: 5 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Test w/ GTrXL net. - use_attention: true - max_seq_len: 10 - attention_num_transformer_units: 1 - attention_dim: 32 - attention_memory_inference: 10 - attention_memory_training: 10 - attention_num_heads: 1 - attention_head_dim: 32 - attention_position_wise_mlp_dim: 32 - # Double batch size (2 GPUs). - train_batch_size: 1000 - -appo-stateless-cartpole-vtrace: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: APPO - # Minimum reward and total ts (in given time_total_s) to pass this test. 
- pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 5 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Test w/ GTrXL net. - use_attention: true - max_seq_len: 10 - attention_num_transformer_units: 1 - attention_dim: 32 - attention_memory_inference: 10 - attention_memory_training: 10 - attention_num_heads: 1 - attention_head_dim: 32 - attention_position_wise_mlp_dim: 32 - # Double batch size (2 GPUs). - train_batch_size: 1000 - -impala-stateless-cartpole: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: IMPALA - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - # Double batch size (2 GPUs). - train_batch_size: 1000 - - # Test w/ GTrXL net. - model: - use_attention: true - max_seq_len: 10 - attention_num_transformer_units: 1 - attention_dim: 32 - attention_memory_inference: 10 - attention_memory_training: 10 - attention_num_heads: 1 - attention_head_dim: 32 - attention_position_wise_mlp_dim: 32 - -pg-stateless-cartpole: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: PG - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 130.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - # Double batch size (2 GPUs). - train_batch_size: 400 - model: - fcnet_hiddens: [64] - fcnet_activation: linear - # Test w/ GTrXL net. 
- use_attention: true - max_seq_len: 10 - attention_num_transformer_units: 1 - attention_dim: 32 - attention_memory_inference: 10 - attention_memory_training: 10 - attention_num_heads: 1 - attention_head_dim: 32 - attention_position_wise_mlp_dim: 32 - -ppo-stateless-cartpole: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: PPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 200000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Test w/ GTrXL net. - use_attention: true - max_seq_len: 10 - attention_num_transformer_units: 1 - attention_dim: 32 - attention_memory_inference: 10 - attention_memory_training: 10 - attention_num_heads: 1 - attention_head_dim: 32 - attention_position_wise_mlp_dim: 32 - # Double batch size (2 GPUs). - train_batch_size: 8000 - _enable_new_api_stack: false - -# TODO (Kourosh): Activate these tests back when the new modeling stack is merged -# r2d2-stateless-cartpole: -# env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole -# run: R2D2 -# # Minimum reward and total ts (in given time_total_s) to pass this test. -# pass_criteria: -# sampler_results/episode_reward_mean: 150.0 -# timesteps_total: 130000 -# stop: -# time_total_s: 1200 -# config: -# num_gpus: 2 -# num_workers: 0 -# # R2D2 settings. -# burn_in: 20 -# zero_init_states: true -# lr: 0.0005 -# # Give some more time to explore. -# exploration_config: -# epsilon_timesteps: 50000 -# model: -# # Test w/ GTrXL net. 
-# use_attention: true -# max_seq_len: 20 -# attention_num_transformer_units: 1 -# attention_dim: 32 -# attention_memory_inference: 10 -# attention_memory_training: 10 -# attention_num_heads: 1 -# attention_head_dim: 32 -# attention_position_wise_mlp_dim: 32 -# # Use a very simple base-model. -# fcnet_hiddens: [64] -# fcnet_activation: linear diff --git a/release/rllib_tests/_old_multi_gpu_with_attention_learning_tests/run.py b/release/rllib_tests/_old_multi_gpu_with_attention_learning_tests/run.py deleted file mode 100644 index 4ed7bb58c416..000000000000 --- a/release/rllib_tests/_old_multi_gpu_with_attention_learning_tests/run.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Multi-GPU + GTrXL (attention net) learning tests for RLlib (torch and tf). -""" - -import json -import os -from pathlib import Path - -from ray.rllib.utils.test_utils import run_learning_tests_from_yaml - -if __name__ == "__main__": - # Get path of this very script to look for yaml files. - abs_yaml_path = Path(__file__).parent - print("abs_yaml_path={}".format(abs_yaml_path)) - - yaml_files = abs_yaml_path.rglob("*.yaml") - yaml_files = sorted( - map(lambda path: str(path.absolute()), yaml_files), reverse=True - ) - - # Run all tests in the found yaml files. 
- results = run_learning_tests_from_yaml(yaml_files) - - test_output_json = os.environ.get( - "TEST_OUTPUT_JSON", "/tmp/rllib_multi_gpu_with_attention_learning_tests.json" - ) - with open(test_output_json, "wt") as f: - json.dump(results, f) - - if len(results["not_passed"]) > 0: - raise ValueError( - "Not all learning tests successfully learned the tasks.\n" - f"Results=\n{results}" - ) - else: - print("Ok.") diff --git a/release/rllib_tests/_old_multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml b/release/rllib_tests/_old_multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml deleted file mode 100644 index 4e973d556d63..000000000000 --- a/release/rllib_tests/_old_multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml +++ /dev/null @@ -1,136 +0,0 @@ - -a2c-stateless-cartpole: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: A2C - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - # Use a large train batch size to make sure mini batches work - # after split to 2 GPU towers. - train_batch_size: 200 - lr: 0.001 - # Test w/ LSTMs. - model: - use_lstm: true - -appo-stateless-cartpole-no-vtrace: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: APPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - vtrace: false - num_gpus: 2 - num_workers: 5 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Test w/ LSTMs. - use_lstm: true - # Double batch size (2 GPUs). 
- train_batch_size: 1000 - -appo-stateless-cartpole-vtrace: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: APPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 5 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Test w/ LSTMs. - use_lstm: true - # Double batch size (2 GPUs). - train_batch_size: 1000 - -impala-stateless-cartpole: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: IMPALA - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - # Double batch size (2 GPUs). - train_batch_size: 1000 - - # Test w/ LSTMs. - model: - use_lstm: true - -pg-stateless-cartpole: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: PG - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - sampler_results/episode_reward_mean: 130.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - # Double batch size (2 GPUs). - train_batch_size: 400 - model: - fcnet_hiddens: [64] - fcnet_activation: linear - # Test w/ LSTMs. - use_lstm: true - -ppo-stateless-cartpole: - env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole - run: PPO - # Minimum reward and total ts (in given time_total_s) to pass this test. 
- pass_criteria: - sampler_results/episode_reward_mean: 150.0 - timesteps_total: 200000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Test w/ LSTMs. - use_lstm: true - # Double batch size (2 GPUs). - train_batch_size: 8000 - _enable_new_api_stack: false diff --git a/release/rllib_tests/_old_multi_gpu_with_lstm_learning_tests/run.py b/release/rllib_tests/_old_multi_gpu_with_lstm_learning_tests/run.py deleted file mode 100644 index 975a3cdd0bc3..000000000000 --- a/release/rllib_tests/_old_multi_gpu_with_lstm_learning_tests/run.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Multi-GPU + LSTM learning tests for RLlib (torch and tf). -""" - -import json -import os -from pathlib import Path - -from ray.rllib.utils.test_utils import run_learning_tests_from_yaml - -if __name__ == "__main__": - # Get path of this very script to look for yaml files. - abs_yaml_path = Path(__file__).parent - print("abs_yaml_path={}".format(abs_yaml_path)) - - yaml_files = abs_yaml_path.rglob("*.yaml") - yaml_files = sorted( - map(lambda path: str(path.absolute()), yaml_files), reverse=True - ) - - # Run all tests in the found yaml files. 
- results = run_learning_tests_from_yaml(yaml_files) - - test_output_json = os.environ.get( - "TEST_OUTPUT_JSON", "/tmp/rllib_multi_gpu_with_lstm_learning_tests.json" - ) - with open(test_output_json, "wt") as f: - json.dump(results, f) - - if len(results["not_passed"]) > 0: - raise ValueError( - "Not all learning tests successfully learned the tasks.\n" - f"Results=\n{results}" - ) - else: - print("Ok.") diff --git a/release/rllib_tests/_old_rllib_multi_gpu_tests.yaml b/release/rllib_tests/_old_rllib_multi_gpu_tests.yaml deleted file mode 100644 index 831c4b5a3731..000000000000 --- a/release/rllib_tests/_old_rllib_multi_gpu_tests.yaml +++ /dev/null @@ -1,92 +0,0 @@ - -- name: rllib_multi_gpu_learning_tests - group: RLlib tests - working_dir: rllib_tests - stable: false - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 8gpus_96cpus.yaml - - run: - timeout: 7200 - script: python multi_gpu_learning_tests/run.py - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 8gpus_96cpus_gce.yaml - -- name: rllib_multi_gpu_with_lstm_learning_tests - group: RLlib tests - working_dir: rllib_tests - stable: false - - frequency: nightly - team: rllib - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 8gpus_96cpus.yaml - - run: - timeout: 7200 - script: python multi_gpu_with_lstm_learning_tests/run.py - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 8gpus_96cpus_gce.yaml - -- name: rllib_multi_gpu_with_attention_learning_tests - group: RLlib tests - working_dir: rllib_tests - - 
frequency: nightly - team: rllib - - cluster: - byod: - type: gpu - post_build_script: byod_rllib_test.sh - runtime_env: - - RLLIB_TEST_NO_JAX_IMPORT=1 - - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin - cluster_compute: 8gpus_96cpus.yaml - - run: - timeout: 7200 - script: python multi_gpu_with_attention_learning_tests/run.py - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - # TODO(https://github.com/ray-project/ray/issues/34591) - # Revert to the comment below once ^ closed. - cluster_compute: 8gpus_96cpus_gce.yaml diff --git a/release/rllib_tests/learning_tests/README.md b/release/rllib_tests/learning_tests/README.md deleted file mode 100644 index 8f3db7f56abc..000000000000 --- a/release/rllib_tests/learning_tests/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# RLlib Hard Learning Test - -Test most important RLlib algorithms with hard enough tasks to prevent performance regression. - -Algorithms in this suite are split into multiple tests, so groups of tests can run in parallel. This is to ensure reasonable total runtime. - -All learning tests have ``stop`` and ``pass_criteria`` configured, where ``stop`` specifies a fixed test duration, and ``pass_criteria`` specified performance goals like ``minimum reward`` and ``minimum throughput``. - -Unlike normal tuned examples, these learning tests always run to the full specified test duration, and would NOT stop early when the ``pass_criteria`` is met. - -This is so they can serve better as performance regression tests: - -* By giving these tests more time, we get better idea of where they actually peak out (instead of simply stopping at a pre-specified reward). So we will have better ideas of minor peak performance regressions when they happen. 
-* By decoupling peak performance from ``pass_criteria``, we can specify a relatively conservative ``pass_criteria``, to avoid having flaky tests that pass and fail because of random fluctuations. -* These conservative passing thresholds help alert us when some algorithms are badly broken. -* Peak reward and throughput numbers gets save in DB, so we can see, hopefully step function, trends over time when we improve things. - -TODO: we don't see progress right now in the time series chart, if an algorithm learns faster, but to the same peak performance. -For that, we need to plot multiple lines at different percentage time mark. - -If you have any questions about these tests, ping jungong@. \ No newline at end of file diff --git a/release/rllib_tests/learning_tests/run_new_api_stack.py b/release/rllib_tests/learning_tests/run_new_api_stack.py deleted file mode 100644 index b04b28d2d6a0..000000000000 --- a/release/rllib_tests/learning_tests/run_new_api_stack.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Learning regression tests on the new API stack for RLlib.""" - -import json -import os -from pathlib import Path - -from ray.rllib.utils.test_utils import run_learning_tests_from_yaml_or_py - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--smoke-test", - action="store_true", - default=False, - help="Finish quickly for training.", - ) - parser.add_argument( - "--config-dir", - type=str, - default="", - help="Sub directory under yaml_files/ to look for .py config files.", - ) - parser.add_argument( - "--framework", - type=str, - default="tf", - help="The framework (tf|tf2|torch) to use.", - ) - args = parser.parse_args() - - assert args.config_dir, "--config-dir can't be empty." - - # Get path of this very script to look for yaml files. 
- abs_config_path = os.path.join( - str(Path(__file__).parent), "yaml_files", args.config_dir - ) - print("abs_config_path={}".format(abs_config_path)) - - py_files = Path(abs_config_path).rglob("*.py") - py_files = sorted(map(lambda path: str(path.absolute()), py_files), reverse=True) - - # Run all tests in the found yaml files. - results = run_learning_tests_from_yaml_or_py( - config_files=py_files, - framework=args.framework, - ) - - test_output_json = os.environ.get("TEST_OUTPUT_JSON", "/tmp/learning_test.json") - with open(test_output_json, "wt") as f: - json.dump(results, f) - - if len(results["not_passed"]) > 0: - raise ValueError( - "Not all learning tests successfully learned the tasks.\n" - f"Results=\n{results}" - ) - else: - print("Ok.") diff --git a/release/rllib_tests/learning_tests/tuned_examples b/release/rllib_tests/learning_tests/tuned_examples new file mode 120000 index 000000000000..c0b75244d04b --- /dev/null +++ b/release/rllib_tests/learning_tests/tuned_examples @@ -0,0 +1 @@ +../../../rllib/tuned_examples \ No newline at end of file diff --git a/rllib/BUILD b/rllib/BUILD index e6966fa341b6..72e97d846854 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -226,7 +226,7 @@ py_test( py_test( name = "learning_tests_cartpole_crashing_appo_old_api_stack", main = "tests/run_regression_tests.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "crashing_cartpole", "no_tf_static_graph"], + tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "crashing_cartpole", "torch_only"], size = "large", srcs = ["tests/run_regression_tests.py"], data = ["tuned_examples/appo/cartpole-crashing-recreate-workers-appo.py"], @@ -248,7 +248,7 @@ py_test( py_test( name = "learning_tests_multi_agent_cartpole_crashing_appo_old_api_stack", main = "tests/run_regression_tests.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", 
"learning_tests_discrete", "crashing_cartpole"], + tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "crashing_cartpole", "torch_only"], size = "large", srcs = ["tests/run_regression_tests.py"], data = ["tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py"], @@ -259,7 +259,7 @@ py_test( py_test( name = "learning_tests_multi_agent_cartpole_crashing_and_stalling_appo_old_api_stack", main = "tests/run_regression_tests.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "crashing_cartpole"], + tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "crashing_cartpole", "torch_only"], size = "large", srcs = ["tests/run_regression_tests.py"], data = ["tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py"], @@ -2400,26 +2400,6 @@ py_test( args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=10", "--evaluation-duration-unit=episodes"] ) -# @OldAPIStack -py_test( - name = "examples/evaluation/evaluation_parallel_to_training_13_episodes_tf_old_api_stack", - main = "examples/evaluation/evaluation_parallel_to_training.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--as-test", "--framework=tf", "--stop-reward=50.0", "--num-cpus=6", "--evaluation-duration=13"] -) - -# @OldAPIStack -py_test( - name = "examples/evaluation/evaluation_parallel_to_training_duration_auto_tf_old_api_stack", - main = "examples/evaluation/evaluation_parallel_to_training.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--as-test", "--stop-reward=50.0", "--framework=tf", "--num-cpus=6", 
"--evaluation-duration=auto"] -) - # @OldAPIStack py_test( name = "examples/evaluation/evaluation_parallel_to_training_duration_auto_torch_old_api_stack", @@ -2430,16 +2410,6 @@ py_test( args = ["--as-test", "--stop-reward=50.0", "--num-cpus=6", "--evaluation-duration=auto"] ) -# @OldAPIStack -py_test( - name = "examples/evaluation/evaluation_parallel_to_training_duration_auto_tf2_old_api_stack", - main = "examples/evaluation/evaluation_parallel_to_training.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--as-test", "--framework=tf2", "--stop-reward=30.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=auto"] -) - # @OldAPIStack py_test( name = "examples/evaluation/evaluation_parallel_to_training_211_ts_torch_old_api_stack", diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 6b71fd59dd0c..c0a8b3fecd29 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -542,7 +542,7 @@ def default_logger_creator(config): # (although their values may be nan), so that Tune doesn't complain # when we use these as stopping criteria. self.evaluation_metrics = { - "evaluation": { + EVALUATION_RESULTS: { ENV_RUNNER_RESULTS: { EPISODE_RETURN_MAX: np.nan, EPISODE_RETURN_MIN: np.nan, diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index e179d047cfc9..f374f45b35e6 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -623,12 +623,6 @@ def get_metrics(self) -> ResultDict: module_episode_returns, ) - # TODO (simon): This results in hundreds of warnings in the logs - # b/c reducing over NaNs is not supported. - # # If no episodes at all, log NaN stats. - # if len(self._done_episodes_for_metrics) == 0: - # self._log_episode_metrics(np.nan, np.nan, np.nan) - # Log num episodes counter for this iteration. 
self.metrics.log_value( NUM_EPISODES, diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index e18e32d7010a..5f708fa600a7 100644 --- a/rllib/env/single_agent_env_runner.py +++ b/rllib/env/single_agent_env_runner.py @@ -802,7 +802,7 @@ def _increase_sampled_metrics(self, num_steps): def _log_episode_metrics(self, length, ret, sec): # Log general episode metrics. - # To mimick the old API stack behavior, we'll use `window` here for + # To mimic the old API stack behavior, we'll use `window` here for # these particular stats (instead of the default EMA). win = self.config.metrics_num_episodes_for_smoothing self.metrics.log_value(EPISODE_LEN_MEAN, length, window=win) diff --git a/rllib/tuned_examples/ppo/atari-ppo.yaml b/rllib/tuned_examples/ppo/atari-ppo.yaml deleted file mode 100644 index 44f76cae0e26..000000000000 --- a/rllib/tuned_examples/ppo/atari-ppo.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# Runs on a single g3.16xl node -# See https://github.com/ray-project/rl-experiments for results -atari-ppo: - env: - grid_search: - - ALE/Breakout-v5 - - ALE/BeamRider-v5 - - ALE/Qbert-v5 - - ALE/SpaceInvaders-v5 - run: PPO - config: - # Works for both torch and tf. - framework: torch - # Make analogous to old v4 + NoFrameskip. 
- env_config: - frameskip: 1 - full_action_space: false - repeat_action_probability: 0.0 - lambda: 0.95 - kl_coeff: 0.5 - clip_rewards: True - clip_param: 0.1 - vf_clip_param: 10.0 - entropy_coeff: 0.01 - train_batch_size: 5000 - rollout_fragment_length: 100 - sgd_minibatch_size: 500 - num_sgd_iter: 10 - num_env_runners: 10 - num_envs_per_env_runner: 5 - batch_mode: truncate_episodes - observation_filter: NoFilter - model: - vf_share_layers: true - num_gpus: 1 diff --git a/rllib/tuned_examples/ppo/atari_ppo.py b/rllib/tuned_examples/ppo/atari_ppo.py new file mode 100644 index 000000000000..ee76d8d3f9ce --- /dev/null +++ b/rllib/tuned_examples/ppo/atari_ppo.py @@ -0,0 +1,93 @@ +import gymnasium as gym + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule +from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner +from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import add_rllib_example_script_args +from ray import tune + + +parser = add_rllib_example_script_args() +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values toset up `config` below. +args = parser.parse_args() + + +def _make_env_to_module_connector(env): + return FrameStackingEnvToModule(num_frames=4) + + +def _make_learner_connector(input_observation_space, input_action_space): + return FrameStackingLearner(num_frames=4) + + +# Create a custom Atari setup (w/o the usual RLlib-hard-coded framestacking in it). +# We would like our frame stacking connector to do this job. +def _env_creator(cfg): + return wrap_atari_for_new_api_stack( + gym.make(args.env, **cfg), + # Perform frame-stacking through ConnectorV2 API. 
+ framestack=None, + ) + + +tune.register_env("env", _env_creator) + + +config = ( + PPOConfig() + .environment( + "env", + env_config={ + # Make analogous to old v4 + NoFrameskip. + "frameskip": 1, + "full_action_space": False, + "repeat_action_probability": 0.0, + }, + clip_rewards=True, + ) + .env_runners( + # num_envs_per_env_runner=5, # 5 on old yaml example + env_to_module_connector=_make_env_to_module_connector, + ) + .training( + learner_connector=_make_learner_connector, + train_batch_size_per_learner=4000, # 5000 on old yaml example + mini_batch_size_per_learner=128, # 500 on old yaml example + lambda_=0.95, + kl_coeff=0.5, + clip_param=0.1, + vf_clip_param=10.0, + entropy_coeff=0.01, + num_sgd_iter=10, + lr=0.00015 * args.num_gpus, + grad_clip=100.0, + grad_clip_by="global_norm", + ) + .rl_module( + model_config_dict={ + "vf_share_layers": True, + "conv_filters": [[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], + "conv_activation": "relu", + "post_fcnet_hiddens": [256], + "uses_new_env_runners": True, + } + ) +) + +stop = { + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 20.0, + NUM_ENV_STEPS_SAMPLED_LIFETIME: 1500000, +} + + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args=args, stop=stop) diff --git a/rllib/tuned_examples/ppo/cartpole-ppo-fake-gpus.yaml b/rllib/tuned_examples/ppo/cartpole-ppo-fake-gpus.yaml index af5b338997be..b4815746cf8c 100644 --- a/rllib/tuned_examples/ppo/cartpole-ppo-fake-gpus.yaml +++ b/rllib/tuned_examples/ppo/cartpole-ppo-fake-gpus.yaml @@ -1,3 +1,4 @@ +# @OldAPIStack cartpole-ppo-fake-gpus: env: CartPole-v1 run: PPO diff --git a/rllib/tuned_examples/ppo/cartpole-ppo-grid-search-example.yaml b/rllib/tuned_examples/ppo/cartpole-ppo-grid-search-example.yaml deleted file mode 100644 index 594aa27182ac..000000000000 --- a/rllib/tuned_examples/ppo/cartpole-ppo-grid-search-example.yaml +++ /dev/null @@ -1,14 +0,0 @@ 
-cartpole-ppo-grid-search-example: - env: CartPole-v1 - run: PPO - stop: - env_runners/episode_return_mean: 200 - time_total_s: 180 - config: - # Works for both torch and tf. - framework: torch - num_env_runners: 2 - num_sgd_iter: - grid_search: [1, 4] - sgd_minibatch_size: - grid_search: [128, 256, 512] diff --git a/rllib/tuned_examples/ppo/cartpole-ppo-hyperband.yaml b/rllib/tuned_examples/ppo/cartpole-ppo-hyperband.yaml deleted file mode 100644 index 2d76a3e98eb6..000000000000 --- a/rllib/tuned_examples/ppo/cartpole-ppo-hyperband.yaml +++ /dev/null @@ -1,16 +0,0 @@ -cartpole-ppo: - env: CartPole-v1 - run: PPO - num_samples: 3 - stop: - env_runners/episode_return_mean: 200 - time_total_s: 180 - config: - # Works for both torch and tf. - framework: torch - num_env_runners: 1 - num_sgd_iter: - grid_search: [1, 4] - sgd_minibatch_size: - grid_search: [128, 256, 512] - observation_filter: MeanStdFilter diff --git a/rllib/tuned_examples/ppo/pong-ppo.yaml b/rllib/tuned_examples/ppo/pong-ppo.yaml deleted file mode 100644 index 8e1419024ae2..000000000000 --- a/rllib/tuned_examples/ppo/pong-ppo.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# On a single GPU, this achieves maximum reward in ~15-20 minutes. -# -# $ python train.py -f tuned_configs/pong-ppo.yaml -# -pong-ppo: - env: ALE/Pong-v5 - run: PPO - config: - # Works for both torch and tf. - framework: torch - # Make analogous to old v4 + NoFrameskip. 
- env_config: - frameskip: 1 - full_action_space: false - repeat_action_probability: 0.0 - lambda: 0.95 - kl_coeff: 0.5 - clip_rewards: True - clip_param: 0.1 - vf_clip_param: 10.0 - entropy_coeff: 0.01 - train_batch_size: 5000 - rollout_fragment_length: auto - sgd_minibatch_size: 500 - num_sgd_iter: 10 - num_env_runners: 32 - num_envs_per_env_runner: 5 - batch_mode: truncate_episodes - observation_filter: NoFilter - num_gpus: 1 - model: - dim: 42 - vf_share_layers: true diff --git a/rllib/tuned_examples/ppo/recomm-sys001-ppo.yaml b/rllib/tuned_examples/ppo/recomm-sys001-ppo.yaml deleted file mode 100644 index 937da8c4b547..000000000000 --- a/rllib/tuned_examples/ppo/recomm-sys001-ppo.yaml +++ /dev/null @@ -1,49 +0,0 @@ -recomm-sys001-ppo: - env: ray.rllib.examples.envs.classes.recommender_system_envs.RecommSys001 - run: PPO - stop: - #evaluation/env_runners/episode_return_mean: 48.0 - timesteps_total: 200000 - config: - framework: torch - - metrics_num_episodes_for_smoothing: 1000 - - # Env c'tor kwargs: - env_config: - # Number of different categories a doc can have and a user can - # have a preference for. - num_categories: 2 - # Number of docs to choose (a slate) from each timestep. - num_docs_to_select_from: 10 - # Slate size. - slate_size: 1 - # Re-sample docs each timesteps. - num_docs_in_db: 100 - # Re-sample user each episode. - num_users_in_db: 100 - # User time budget (determines lengths of episodes). - user_time_budget: 60.0 - - # Larger networks seem to help (large obs/action spaces). - model: - fcnet_hiddens: [256, 256] - - # Larger batch sizes seem to help (more stability, even with higher lr). - #train_batch_size: 32 - - #num_env_runners: 2 - #num_gpus: 0 - - #lr_choice_model: 0.002 - #lr_q_model: 0.002 - - #target_network_update_freq: 500 - #tau: 1.0 - - # Evaluation settings. 
- evaluation_interval: 1 - evaluation_num_env_runners: 4 - evaluation_duration: 200 - evaluation_duration_unit: episodes - evaluation_parallel_to_training: true diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index e6bd99c11cfb..6fe3f8069d43 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -6,6 +6,7 @@ from gymnasium.spaces import Dict as GymDict from gymnasium.spaces import Tuple as GymTuple import inspect +import json import logging import numpy as np import os @@ -220,6 +221,14 @@ def add_rllib_example_script_args( "be achieved within --stop-timesteps AND --stop-iters, otherwise this " "script will throw an exception at the end.", ) + parser.add_argument( + "--as-release-test", + action="store_true", + help="Whether this script should be run as a release test. If set, " + "all that applies to the --as-test option is true, plus, a short JSON summary " + "will be written into a results file whose location is given by the ENV " + "variable `TEST_OUTPUT_JSON`.", + ) # Learner scaling options. # Old API stack: config.num_gpus. @@ -630,8 +639,8 @@ def check_learning_achieved( Raises: ValueError: If `min_reward` not reached. """ - # Get maximum reward of all trials - # (check if at least one trial achieved some learning) + # Get maximum value of `metrics` over all trials + # (check if at least one trial achieved some learning, not just the final one). recorded_values = [] for _, row in tune_results.get_dataframe().iterrows(): if evaluation or ( @@ -1373,11 +1382,15 @@ def run_rllib_example_script_experiment( parser = add_rllib_example_script_args() args = parser.parse_args() + # If run --as-release-test, --as-test must also be set. + if args.as_release_test: + args.as_test = True + # Initialize Ray. ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) # Define one or more stopping criteria. 
- if not stop: + if stop is None: stop = { f"{ENV_RUNNER_RESULTS}/episode_return_mean": args.stop_reward, f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": args.stop_timesteps, @@ -1391,8 +1404,8 @@ def run_rllib_example_script_experiment( # Set the framework. config.framework(args.framework) - # Add an env specifier? - if args.env is not None: + # Add an env specifier (only if not already set in config)? + if args.env is not None and config.env is None: config.environment(args.env) # Enable the new API stack? @@ -1423,6 +1436,7 @@ def run_rllib_example_script_experiment( # Run the experiment w/o Tune (directly operate on the RLlib Algorithm object). if args.no_tune: + assert not args.as_test and not args.as_release_test algo = config.build() for _ in range(stop.get(TRAINING_ITERATION, args.stop_iters)): results = algo.train() @@ -1443,7 +1457,9 @@ def run_rllib_example_script_experiment( break if val is not None and not np.isnan(val) and val >= threshold: print(f"Stop criterium ({key}={threshold}) fulfilled!") + ray.shutdown() return results + ray.shutdown() return results @@ -1490,6 +1506,7 @@ def run_rllib_example_script_experiment( os.environ["RAY_AIR_NEW_OUTPUT"] = "0" # Run the actual experiment (using Tune). + start_time = time.time() results = tune.Tuner( trainable or config.algo_class, param_space=config, @@ -1505,30 +1522,61 @@ def run_rllib_example_script_experiment( ), tune_config=tune.TuneConfig(num_samples=args.num_samples), ).fit() + time_taken = time.time() - start_time + + ray.shutdown() # If run as a test, check whether we reached the specified success criteria. + test_passed = False if args.as_test: # Success metric not provided, try extracting it from `stop`. 
if success_metric is None: for try_it in [ - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/episode_return_mean", - f"{ENV_RUNNER_RESULTS}/episode_return_mean", + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}", + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}", ]: if try_it in stop: success_metric = {try_it: stop[try_it]} break if success_metric is None: success_metric = { - f"{ENV_RUNNER_RESULTS}/episode_return_mean": args.stop_reward, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, } # TODO (sven): Make this work for more than one metric (AND-logic?). - metric = next(iter(success_metric.keys())) - check_learning_achieved( - tune_results=results, - metric=metric, - min_value=success_metric[metric], + # Get maximum value of `metric` over all trials + # (check if at least one trial achieved some learning, not just the final one). + success_metric_key, success_metric_value = next(iter(success_metric.items())) + best_value = max( + row[success_metric_key] for _, row in results.get_dataframe().iterrows() ) - ray.shutdown() + if best_value >= success_metric_value: + test_passed = True + print(f"`{success_metric_key}` of {success_metric_value} reached! ok") + + if args.as_release_test: + trial = results._experiment_analysis.trials[0] + stats = trial.last_result + stats.pop("config", None) + json_summary = { + "time_taken": float(time_taken), + "trial_states": [trial.status], + "last_update": float(time.time()), + "stats": stats, + "passed": [test_passed], + "not_passed": [not test_passed], + "failures": {str(trial): 1} if not test_passed else {}, + } + with open( + os.environ.get("TEST_OUTPUT_JSON", "/tmp/learning_test.json"), + "wt", + ) as f: + json.dump(json_summary, f) + + if not test_passed: + raise ValueError( + f"`{success_metric_key}` of {success_metric_value} not reached!" 
+ ) + return results From e528cb04f2fae2ed6a5ac35d4a82917358d16d3c Mon Sep 17 00:00:00 2001 From: Hongchao Deng Date: Thu, 30 May 2024 12:56:54 -0700 Subject: [PATCH 53/65] [core] add EC2InstanceTerminator and refactor killer creation (#45630) Signed-off-by: hongchaodeng --- python/ray/_private/test_utils.py | 62 ++++++++++++------- python/ray/tests/conftest.py | 4 +- python/ray/tests/test_chaos.py | 7 +-- .../chaos_test/test_chaos_basic.py | 2 +- release/nightly_tests/setup_chaos.py | 41 +++++++++--- 5 files changed, 78 insertions(+), 38 deletions(-) diff --git a/python/ray/_private/test_utils.py b/python/ray/_private/test_utils.py index 62e51016af2e..9e56c2b148fa 100644 --- a/python/ray/_private/test_utils.py +++ b/python/ray/_private/test_utils.py @@ -1493,8 +1493,7 @@ async def get_total_killed(self): return self.killed -@ray.remote(num_cpus=0) -class NodeKillerActor(ResourceKillerActor): +class NodeKillerBase(ResourceKillerActor): async def _find_resource_to_kill(self): node_to_kill_ip = None node_to_kill_port = None @@ -1521,6 +1520,16 @@ async def _find_resource_to_kill(self): return node_id, node_to_kill_ip, node_to_kill_port + def _get_alive_nodes(self, nodes): + alive_nodes = 0 + for node in nodes: + if node["Alive"]: + alive_nodes += 1 + return alive_nodes + + +@ray.remote(num_cpus=0) +class RayletKiller(NodeKillerBase): def _kill_resource(self, node_id, node_to_kill_ip, node_to_kill_port): if node_to_kill_port is not None: try: @@ -1533,6 +1542,33 @@ def _kill_resource(self, node_id, node_to_kill_ip, node_to_kill_port): ) self.killed.add(node_id) + def _kill_raylet(self, ip, port, graceful=False): + import grpc + from grpc._channel import _InactiveRpcError + from ray.core.generated import node_manager_pb2_grpc + + raylet_address = f"{ip}:{port}" + channel = grpc.insecure_channel(raylet_address) + stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel) + try: + stub.ShutdownRaylet( + node_manager_pb2.ShutdownRayletRequest(graceful=graceful) + ) + 
except _InactiveRpcError: + assert not graceful + + +@ray.remote(num_cpus=0) +class EC2InstanceTerminator(NodeKillerBase): + def _kill_resource(self, node_id, node_to_kill_ip, _): + if node_to_kill_ip is not None: + try: + self._terminate_ec2_instance(node_to_kill_ip) + except Exception: + pass + logging.info(f"Terminated instance, {node_id=}, address={node_to_kill_ip}") + self.killed.add(node_id) + def _terminate_ec2_instance(self, ip): # This command uses IMDSv2 to get the host instance id and region. # After that it terminates itself using aws cli. @@ -1552,28 +1588,6 @@ def _terminate_ec2_instance(self, ip): print(f"STDOUT:\n{result.stdout}\n") print(f"STDERR:\n{result.stderr}\n") - def _kill_raylet(self, ip, port, graceful=False): - import grpc - from grpc._channel import _InactiveRpcError - from ray.core.generated import node_manager_pb2_grpc - - raylet_address = f"{ip}:{port}" - channel = grpc.insecure_channel(raylet_address) - stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel) - try: - stub.ShutdownRaylet( - node_manager_pb2.ShutdownRayletRequest(graceful=graceful) - ) - except _InactiveRpcError: - assert not graceful - - def _get_alive_nodes(self, nodes): - alive_nodes = 0 - for node in nodes: - if node["Alive"]: - alive_nodes += 1 - return alive_nodes - @ray.remote(num_cpus=0) class WorkerKillerActor(ResourceKillerActor): diff --git a/python/ray/tests/conftest.py b/python/ray/tests/conftest.py index fcc70b023918..ead0ec9648ad 100644 --- a/python/ray/tests/conftest.py +++ b/python/ray/tests/conftest.py @@ -37,7 +37,7 @@ find_available_port, wait_for_condition, find_free_port, - NodeKillerActor, + RayletKiller, ) from ray.cluster_utils import AutoscalingCluster, Cluster, cluster_not_supported @@ -921,7 +921,7 @@ def _ray_start_chaos_cluster(request): assert len(nodes) == 1 if kill_interval is not None: - node_killer = get_and_run_resource_killer(NodeKillerActor, kill_interval) + node_killer = get_and_run_resource_killer(RayletKiller, 
kill_interval) yield cluster diff --git a/python/ray/tests/test_chaos.py b/python/ray/tests/test_chaos.py index f59308c4142b..3ec1e0cce317 100644 --- a/python/ray/tests/test_chaos.py +++ b/python/ray/tests/test_chaos.py @@ -12,7 +12,7 @@ from ray.data._internal.progress_bar import ProgressBar from ray.util.placement_group import placement_group from ray._private.test_utils import ( - NodeKillerActor, + RayletKiller, get_log_message, get_and_run_resource_killer, WorkerKillerActor, @@ -337,15 +337,14 @@ def test_node_killer_filter(autoscaler_v2): worker_nodes = [node for node in list_nodes() if not node["is_head_node"]] node_to_kill = random.choice(worker_nodes) node_killer = get_and_run_resource_killer( - NodeKillerActor, + RayletKiller, 1, max_to_kill=1, kill_filter_fn=lambda: lambda node: node["NodeID"] == node_to_kill.node_id, ) def check_killed(): - # Check that killed node is consistent across list_nodes() and - # NodeKillerActor + # Check that killed node is consistent across list_nodes() killed = list(ray.get(node_killer.get_total_killed.remote())) dead = [node.node_id for node in list_nodes() if node.state == "DEAD"] if len(killed) != 1 or len(dead) != 1: diff --git a/release/nightly_tests/chaos_test/test_chaos_basic.py b/release/nightly_tests/chaos_test/test_chaos_basic.py index 50b772caac1f..9cf120b0265b 100644 --- a/release/nightly_tests/chaos_test/test_chaos_basic.py +++ b/release/nightly_tests/chaos_test/test_chaos_basic.py @@ -206,7 +206,7 @@ def main(): # Step 3 print("Running with failures") start = time.time() - node_killer = ray.get_actor("NodeKillerActor", namespace="release_test_namespace") + node_killer = ray.get_actor("RayletKiller", namespace="release_test_namespace") node_killer.run.remote() workload(total_num_cpus, args.smoke) print(f"Runtime when there are many failures: {time.time() - start}") diff --git a/release/nightly_tests/setup_chaos.py b/release/nightly_tests/setup_chaos.py index d1f6b7fbf3d9..b88c81ed3db2 100644 --- 
a/release/nightly_tests/setup_chaos.py +++ b/release/nightly_tests/setup_chaos.py @@ -5,14 +5,28 @@ from ray._private.test_utils import ( get_and_run_resource_killer, - NodeKillerActor, + RayletKiller, WorkerKillerActor, + EC2InstanceTerminator, ) def parse_script_args(): parser = argparse.ArgumentParser() + + # '--kill-workers' to be deprecated in favor of '--chaos' parser.add_argument("--kill-workers", action="store_true", default=False) + + parser.add_argument( + "--chaos", + type=str, + default="", + help=( + "Chaos to inject into the test environment. " + "Options: KillRaylet, KillWorker, TerminateEC2Instance." + ), + ) + parser.add_argument("--kill-interval", type=int, default=60) parser.add_argument("--max-to-kill", type=int, default=2) parser.add_argument( @@ -77,6 +91,24 @@ def _filter_fn(node): return _task_node_filter +def get_chaos_killer(args): + if args.chaos != "": + chaos_type = args.chaos + elif args.kill_workers: + chaos_type = "KillWorker" + else: + chaos_type = "KillRaylet" # default + + if chaos_type == "KillRaylet": + return RayletKiller, task_node_filter(args.task_names) + elif chaos_type == "KillWorker": + return WorkerKillerActor, task_filter(args.task_names) + elif chaos_type == "TerminateEC2Instance": + return EC2InstanceTerminator, task_node_filter(args.task_names) + else: + raise ValueError(f"Chaos type {chaos_type} not supported.") + + def main(): """Start the chaos testing. 
@@ -84,12 +116,7 @@ def main(): """ args, _ = parse_script_args() ray.init(address="auto") - if args.kill_workers: - resource_killer_cls = WorkerKillerActor - kill_filter_fn = task_filter(args.task_names) - else: - resource_killer_cls = NodeKillerActor - kill_filter_fn = task_node_filter(args.task_names) + resource_killer_cls, kill_filter_fn = get_chaos_killer(args) get_and_run_resource_killer( resource_killer_cls, From ff3e39398dc2dcce35de793249e736f0e9fce57d Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Thu, 30 May 2024 14:09:12 -0700 Subject: [PATCH 54/65] add oss tag to container tests (#45629) add oss tag to container tests Add `oss` tag to container tests. Signed-off-by: Cindy Zhang Signed-off-by: Cindy Zhang --- .buildkite/core.rayci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index 0c0bb5921e75..cbc2b9a17259 100644 --- a/.buildkite/core.rayci.yml +++ b/.buildkite/core.rayci.yml @@ -315,6 +315,7 @@ steps: tags: - python - docker + - oss instance_type: medium commands: - bazel run //ci/ray_ci:build_in_docker -- docker --platform cpu From 18eb4337552c5e94be730f0ea8a55283fb48f641 Mon Sep 17 00:00:00 2001 From: Zhi Lin Date: Fri, 31 May 2024 05:09:31 +0800 Subject: [PATCH 55/65] [HPU] [Train] Add a Stable Diffusion fine-tuning and serving example (#45217) This PR adds an example for stable diffusion model fine-tuning and serving using HPU. Moreover, it also covers how to adapt an existing HPU example to run on Ray, so that users can use Ray to run the examples on huggingface/optimum-habana. 
--------- Signed-off-by: Zhi Lin Signed-off-by: Yunxuan Xiao Signed-off-by: Samuel Chan <116198444+anyscalesam@users.noreply.github.com> Co-authored-by: Yunxuan Xiao Co-authored-by: Yunxuan Xiao Co-authored-by: Samuel Chan <116198444+anyscalesam@users.noreply.github.com> Co-authored-by: Peyton Murray --- doc/source/train/examples.yml | 10 + .../train/examples/intel_gaudi/sd.ipynb | 427 ++++++++++++++++++ 2 files changed, 437 insertions(+) create mode 100644 doc/source/train/examples/intel_gaudi/sd.ipynb diff --git a/doc/source/train/examples.yml b/doc/source/train/examples.yml index 1a828ddc1714..505c757e92ae 100644 --- a/doc/source/train/examples.yml +++ b/doc/source/train/examples.yml @@ -73,6 +73,16 @@ examples: - computer vision - generative ai link: examples/pytorch/dreambooth_finetuning + - title: Finetune Stable Diffusion and generate images with Intel Gaudi + skill_level: intermediate + frameworks: + - accelerate + - transformers + use_cases: + - computer vision + - generative ai + contributor: community + link: examples/intel_gaudi/sd - title: Train a text classifier with PyTorch Lightning and Ray Data frameworks: - lightning diff --git a/doc/source/train/examples/intel_gaudi/sd.ipynb b/doc/source/train/examples/intel_gaudi/sd.ipynb new file mode 100644 index 000000000000..efe2146781ce --- /dev/null +++ b/doc/source/train/examples/intel_gaudi/sd.ipynb @@ -0,0 +1,427 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finetune Stable Diffusion and generate images with Intel Gaudi\n", + "In this tutorial, we will go through how to finetune a stable diffusion model and generate images with Intel Gaudi(HPU). Moreover, we will show how to adapt an existing HPU example to use Ray. 
Once you have learned how to make the adaptation, you can easily access more models and optimizations that have been developed for HPU by \"Ray-ifying\" examples from [optimum-habana/examples](https://github.com/huggingface/optimum-habana/tree/main/examples) and [Model References](https://github.com/HabanaAI/Model-References)!\n", + "\n", + "Now, let's see how we can \"Ray-ify\" this [stable diffusion example](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration\n", + "A node with Gaudi/Gaudi2 installed is required to run this example. Both Gaudi and Gaudi2 have 8 HPUs.\n", + "\n", + "We recommend using a prebuilt container to run these examples. To run a container, you need Docker. See [Install Docker Engine](https://docs.docker.com/engine/install/) for installation instructions.\n", + "\n", + "Next, follow [Run Using Containers](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html?highlight=installer#run-using-containers) to install the Gaudi drivers and container runtime.\n", + "\n", + "Then, start the Gaudi container:\n", + "```bash\n", + "docker pull vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest\n", + "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest\n", + "```\n", + "\n", + "Inside the container, clone [Optimum-Habana](https://github.com/huggingface/optimum-habana/) and install the dependencies:\n", + "```bash\n", + "git clone https://github.com/huggingface/optimum-habana.git\n", + "pip install ray[train,serve] optimum-habana\n", + "cd optimum-habana/\n", + "pip install -r examples/stable-diffusion/requirements.txt\n", + "pip install -r 
examples/stable-diffusion/training/requirements.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-tuning with Textual Inversion\n", + "First, let's start with fine-tuning. Check `examples/stable-diffusion/training/textual_inversion.py`, which fine-tunes a Stable Diffusion model on HPU. You can follow [this document](https://github.com/huggingface/optimum-habana/blob/main/examples/stable-diffusion/training/README.md#textual-inversion) and try it once without using Ray.\n", + "\n", + "In order to run this script on Ray, we need to make some changes. But don't worry, it's actually pretty simple. Basically, we just need to identify the main training loop, and run it in TorchTrainer.\n", + "\n", + "First, check this block at the end of the file:\n", + "```python\n", + "if __name__ == \"__main__\":\n", + " main()\n", + "```\n", + "\n", + "Originally, this script is started by MPI if multiple workers are used. But with Ray, we should set up a TorchTrainer and supply a main function, which is `main()` in this example.\n", + "\n", + "Therefore, it becomes straightforward to make these changes:\n", + "```python\n", + "if __name__ == \"__main__\":\n", + " import ray\n", + " from ray import train\n", + " from ray.train import ScalingConfig, Checkpoint, CheckpointConfig, RunConfig\n", + " from ray.train.torch import TorchTrainer, TorchConfig\n", + "\n", + " ray.init(address=\"auto\")\n", + "\n", + " # Configure computation resources\n", + " # In ScalingConfig, require an HPU for each worker\n", + " scaling_config = ScalingConfig(num_workers=1, resources_per_worker={\"CPU\": 1, \"HPU\": 1})\n", + " # Set backend to hccl in TorchConfig\n", + " torch_config = TorchConfig(backend = \"hccl\")\n", + " # Initialize a Ray TorchTrainer\n", + " trainer = TorchTrainer(\n", + " train_loop_per_worker=main,\n", + " torch_config=torch_config,\n", + " scaling_config=scaling_config,\n", + " )\n", + "\n", + " result = trainer.fit()\n", + 
"```\n", + "\n", + "Before we try to run, we need to inspect the `main` function to see if it can work in this way. As we skim through the function, it's clear that it does not take any input parameters, but it calls `parse_args` to get all configurations. Originally, these configurations are set on the command line by MPI. But because we switch to Ray to start the workers, command line arguments are no longer accessible. Therefore, `parse_args` should be called in the main program and its result passed to the `main` function.\n", + "\n", + "Apart from this, no other changes are necessary. By inserting the following code, you can now run the script on Ray." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace lines below:\n", + "# def main():\n", + "# args = parse_args()\n", + "# with these lines:\n", + "def main(config):\n", + " args = config[\"args\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace lines below:\n", + "# if __name__ == \"__main__\":\n", + "# main()\n", + "# with these lines:\n", + "if __name__ == \"__main__\":\n", + " import ray\n", + " from ray import train\n", + " from ray.train import ScalingConfig, Checkpoint, CheckpointConfig, RunConfig\n", + " from ray.train.torch import TorchTrainer, TorchConfig\n", + "\n", + " ray.init(address=\"auto\")\n", + "\n", + " # Configure computation resources\n", + " # In ScalingConfig, require an HPU for each worker\n", + " scaling_config = ScalingConfig(num_workers=1, resources_per_worker={\"CPU\": 1, \"HPU\": 1})\n", + " # Set backend to hccl in TorchConfig\n", + " torch_config = TorchConfig(backend = \"hccl\")\n", + " # Initialize a Ray TorchTrainer\n", + " trainer = TorchTrainer(\n", + " train_loop_per_worker=main,\n", + "\t\ttrain_loop_config={\"args\": parse_args()},\n", + " torch_config=torch_config,\n", + " scaling_config=scaling_config,\n", + " )\n", + "\n", + " result = 
trainer.fit()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One last thing: remember to use absolute paths in the command line arguments. The reason is similar to why we moved `parse_args` out: Ray's workers do not share the current working directory. Now, you can run the fine-tuning of Stable Diffusion on Ray!\n", + "An example command:\n", + "```bash\n", + "python /root/optimum-habana/examples/stable-diffusion/training/textual_inversion.py \\\n", + " --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \\\n", + " --train_data_dir \"/root/cat\" \\\n", + " --learnable_property object \\\n", + " --placeholder_token \"<cat-toy>\" \\\n", + " --initializer_token toy \\\n", + " --resolution 512 \\\n", + " --train_batch_size 4 \\\n", + " --max_train_steps 3000 \\\n", + " --learning_rate 5.0e-04 \\\n", + " --scale_lr \\\n", + " --lr_scheduler constant \\\n", + " --lr_warmup_steps 0 \\\n", + " --output_dir /tmp/textual_inversion_cat \\\n", + " --save_as_full_pipeline \\\n", + " --gaudi_config_name Habana/stable-diffusion \\\n", + " --throughput_warmup_steps 3\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output of the example command:\n", + "```bash\n", + "Training started with configuration:\n", + "╭───────────────────────────────────────────────╮\n", + "│ Training config │\n", + "├───────────────────────────────────────────────┤\n", + "│ train_loop_config/args ...t_warmup_steps=3) │\n", + "╰───────────────────────────────────────────────╯\n", + "(RayTrainWorker pid=15683) Setting up process group for: env:// [rank=0, world_size=1]\n", + "(TorchTrainer pid=15530) Started distributed worker processes: \n", + "(TorchTrainer pid=15530) - (ip=172.17.0.2, pid=15683) world_rank=0, local_rank=0, node_rank=0\n", + "(RayTrainWorker pid=15683) [WARNING|utils.py:185] 2024-05-09 05:21:13,961 >> optimum-habana v1.10.4 has been validated for SynapseAI v1.14.0 but habana-frameworks v1.15.1.15 was found, this could 
lead to undefined behavior!\n", + "(RayTrainWorker pid=15683) [WARNING|utils.py:198] 2024-05-09 05:21:15,401 >> optimum-habana v1.10.4 has been validated for SynapseAI v1.14.0 but the driver version is v1.15.0, this could lead to undefined behavior!\n", + "(RayTrainWorker pid=15683) /usr/local/lib/python3.10/dist-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + "(RayTrainWorker pid=15683) torch.utils._pytree._register_pytree_node(\n", + "(RayTrainWorker pid=15683) /usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + "(RayTrainWorker pid=15683) warnings.warn(\n", + "(RayTrainWorker pid=15683) 05/09/2024 05:21:15 - INFO - __main__ - Distributed environment: MULTI_HPU Backend: hccl\n", + "(RayTrainWorker pid=15683) Num processes: 1\n", + "(RayTrainWorker pid=15683) Process index: 0\n", + "(RayTrainWorker pid=15683) Local process index: 0\n", + "(RayTrainWorker pid=15683) Device: hpu\n", + "(RayTrainWorker pid=15683) \n", + "(RayTrainWorker pid=15683) Mixed precision type: bf16\n", + "(RayTrainWorker pid=15683) \n", + "(RayTrainWorker pid=15683) {'timestep_spacing', 'rescale_betas_zero_snr', 'dynamic_thresholding_ratio', 'prediction_type', 'thresholding', 'sample_max_value', 'clip_sample_range'} was not found in config. 
Values will be initialized to default values.\n", + "(RayTrainWorker pid=15683) ============================= HABANA PT BRIDGE CONFIGURATION =========================== \n", + "(RayTrainWorker pid=15683) PT_HPU_LAZY_MODE = 1\n", + "(RayTrainWorker pid=15683) PT_RECIPE_CACHE_PATH = \n", + "(RayTrainWorker pid=15683) PT_CACHE_FOLDER_DELETE = 0\n", + "(RayTrainWorker pid=15683) PT_HPU_RECIPE_CACHE_CONFIG = \n", + "(RayTrainWorker pid=15683) PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807\n", + "(RayTrainWorker pid=15683) PT_HPU_LAZY_ACC_PAR_MODE = 0\n", + "(RayTrainWorker pid=15683) PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0\n", + "(RayTrainWorker pid=15683) ---------------------------: System Configuration :---------------------------\n", + "(RayTrainWorker pid=15683) Num CPU Cores : 152\n", + "(RayTrainWorker pid=15683) CPU RAM : 1056440348 KB\n", + "(RayTrainWorker pid=15683) ------------------------------------------------------------------------------\n", + "(RayTrainWorker pid=15683) {'scaling_factor', 'force_upcast'} was not found in config. Values will be initialized to default values.\n", + "(RayTrainWorker pid=15683) {'addition_embed_type', 'resnet_out_scale_factor', 'transformer_layers_per_block', 'attention_type', 'conv_out_kernel', 'time_embedding_type', 'addition_embed_type_num_heads', 'dropout', 'only_cross_attention', 'projection_class_embeddings_input_dim', 'num_attention_heads', 'upcast_attention', 'reverse_transformer_layers_per_block', 'resnet_time_scale_shift', 'resnet_skip_time_act', 'time_embedding_act_fn', 'class_embeddings_concat', 'time_cond_proj_dim', 'num_class_embeds', 'mid_block_type', 'cross_attention_norm', 'addition_time_embed_dim', 'dual_cross_attention', 'mid_block_only_cross_attention', 'encoder_hid_dim_type', 'time_embedding_dim', 'class_embed_type', 'conv_in_kernel', 'timestep_post_act', 'encoder_hid_dim', 'use_linear_projection'} was not found in config. 
Values will be initialized to default values.\n", + "(RayTrainWorker pid=15683) 05/09/2024 05:21:20 - INFO - __main__ - ***** Running training *****\n", + "(RayTrainWorker pid=15683) 05/09/2024 05:21:20 - INFO - __main__ - Num examples = 600\n", + "(RayTrainWorker pid=15683) 05/09/2024 05:21:20 - INFO - __main__ - Num Epochs = 20\n", + "(RayTrainWorker pid=15683) 05/09/2024 05:21:20 - INFO - __main__ - Instantaneous batch size per device = 4\n", + "(RayTrainWorker pid=15683) 05/09/2024 05:21:20 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 4\n", + "(RayTrainWorker pid=15683) 05/09/2024 05:21:20 - INFO - __main__ - Gradient Accumulation steps = 1\n", + "(RayTrainWorker pid=15683) 05/09/2024 05:21:20 - INFO - __main__ - Total optimization steps = 3000\n", + "Steps: 0%| | 0/3000 [00:00> Running on CPU.\n", + "(RayTrainWorker pid=15683) Configuration saved in /tmp/textual_inversion_cat/vae/config.json\n", + "(RayTrainWorker pid=15683) Model weights saved in /tmp/textual_inversion_cat/vae/diffusion_pytorch_model.safetensors\n", + "(RayTrainWorker pid=15683) Configuration saved in /tmp/textual_inversion_cat/unet/config.json\n", + "(RayTrainWorker pid=15683) Model weights saved in /tmp/textual_inversion_cat/unet/diffusion_pytorch_model.safetensors\n", + "(RayTrainWorker pid=15683) Configuration saved in /tmp/textual_inversion_cat/scheduler/scheduler_config.json\n", + "(RayTrainWorker pid=15683) Configuration saved in /tmp/textual_inversion_cat/model_index.json\n", + "(RayTrainWorker pid=15683) 05/09/2024 05:40:31 - INFO - __main__ - Saving embeddings\n", + "Steps: 100%|██████████| 3000/3000 [19:10<00:00, 2.61it/s, loss=0.0261, lr=0.002]\n", + "\n", + "Training completed after 0 iterations at 2024-05-09 05:40:33. 
Total running time: 19min 30s\n", + "2024-05-09 05:40:33,116\tINFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/TorchTrainer_2024-05-09_05-21-02' in 0.0022s.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One of the advantages of Ray is that it scales easily. In this example, we can easily scale the training to multiple workers by changing `num_workers` in `ScalingConfig`. The Torch distributed environment is automatically initialized by Ray." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Serve the fine-tuned model on Ray\n", + "Now that we have fine-tuned a Stable Diffusion model, we can serve it for image generation. The code below loads the fine-tuned model and generates an image." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from optimum.habana.diffusers import GaudiStableDiffusionPipeline\n", + "model_id = \"/tmp/textual_inversion_cat/\"\n", + "pipe = GaudiStableDiffusionPipeline.from_pretrained(\n", + " model_id,\n", + " torch_dtype=torch.bfloat16,\n", + " use_habana=True,\n", + " use_hpu_graphs=True,\n", + " gaudi_config=\"Habana/stable-diffusion\",\n", + ")\n", + "prompt = \"a <cat-toy> is dancing on the grass.\"\n", + "image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]\n", + "image.save(\"cat-backpack.png\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can easily use Ray Serve to deploy it as an HTTP service. The code below is modified from this [example](https://docs.ray.io/en/master/serve/tutorials/stable-diffusion.html). Save it to `gaudi_sd_deploy.py`, and use `serve run gaudi_sd_deploy:entrypoint` to start the Serve application." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from optimum.habana.diffusers import GaudiStableDiffusionPipeline\n", + "from io import BytesIO\n", + "from fastapi import FastAPI\n", + "from fastapi.responses import Response\n", + "\n", + "from ray import serve\n", + "from ray.serve.handle import DeploymentHandle\n", + "\n", + "\n", + "app = FastAPI()\n", + "\n", + "\n", + "@serve.deployment(num_replicas=1)\n", + "@serve.ingress(app)\n", + "class APIIngress:\n", + " def __init__(self, diffusion_model_handle: DeploymentHandle) -> None:\n", + " self.handle = diffusion_model_handle\n", + "\n", + " @app.get(\n", + " \"/imagine\",\n", + " responses={200: {\"content\": {\"image/png\": {}}}},\n", + " response_class=Response,\n", + " )\n", + " async def generate(self, prompt: str, img_size: int = 512):\n", + " assert len(prompt), \"prompt parameter cannot be empty\"\n", + "\n", + " image = await self.handle.generate.remote(prompt, img_size=img_size)\n", + " file_stream = BytesIO()\n", + " image.save(file_stream, \"PNG\")\n", + " return Response(content=file_stream.getvalue(), media_type=\"image/png\")\n", + "\n", + "\n", + "@serve.deployment(\n", + " ray_actor_options={\"resources\": {\"HPU\": 1}}\n", + ")\n", + "class GaudiStableDiffusion:\n", + " def __init__(self, model_id):\n", + " self.pipe = GaudiStableDiffusionPipeline.from_pretrained(\n", + " model_id,\n", + " torch_dtype=torch.bfloat16,\n", + " use_habana=True,\n", + " use_hpu_graphs=True,\n", + " gaudi_config=\"Habana/stable-diffusion\",\n", + " )\n", + "\n", + " def generate(self, prompt: str, img_size: int = 512):\n", + " assert len(prompt), \"prompt parameter cannot be empty\"\n", + "\n", + " image = self.pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]\n", + "\t\treturn image\n", + "\n", + "\n", + "entrypoint = APIIngress.bind(GaudiStableDiffusion.bind(\"/tmp/textual_inversion_cat/\"))" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "After you have successfully deployed this Serve application, run the code below to generate an image." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "prompt = \"a <cat-toy> is dancing on the grass.\"\n", + "input = \"%20\".join(prompt.split(\" \"))\n", + "resp = requests.get(f\"http://127.0.0.1:8000/imagine?prompt={input}\")\n", + "with open(\"output.png\", 'wb') as f:\n", + " f.write(resp.content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is an example image:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import Image\n", + "Image(filename='output.png')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + }, + "orphan": true + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 3240167967e5b05f97ac51fc82f527eed6ee4afe Mon Sep 17 00:00:00 2001 From: "Yang, Bo" Date: Thu, 30 May 2024 16:11:59 -0700 Subject: [PATCH 56/65] [Core] Handle `TypeError` when `RayTaskError.cause` is a `BaseExceptionGroup` (#45523) Signed-off-by: Yang, Bo --- python/ray/exceptions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/ray/exceptions.py b/python/ray/exceptions.py index daec24914f02..e35eee260d95 100644 --- a/python/ray/exceptions.py +++ b/python/ray/exceptions.py @@ -187,6 +187,7 @@ def 
as_instanceof_cause(self): try: dual_cls = self.make_dual_exception_type() + return dual_cls(self.cause) except TypeError as e: logger.warning( f"User exception type {type(self.cause)} in RayTaskError can't" @@ -196,8 +197,6 @@ def as_instanceof_cause(self): ) return self - return dual_cls(self.cause) - def __str__(self): """Format a RayTaskError as a string.""" lines = self.traceback_str.strip().split("\n") From b73e03768b0fca4bf566e8155e1fb767a84f9a7b Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Thu, 30 May 2024 22:56:59 -0700 Subject: [PATCH 57/65] [Core] Fix worker column off-by-one in dashboard (#45648) Signed-off-by: Rui Qiao --- dashboard/client/src/pages/node/NodeRow.tsx | 1 + 1 file changed, 1 insertion(+) diff --git a/dashboard/client/src/pages/node/NodeRow.tsx b/dashboard/client/src/pages/node/NodeRow.tsx index bab7238a954f..3c5b89adb36d 100644 --- a/dashboard/client/src/pages/node/NodeRow.tsx +++ b/dashboard/client/src/pages/node/NodeRow.tsx @@ -279,6 +279,7 @@ export const WorkerRow = ({ node, worker }: WorkerRowProps) => { + N/A {coreWorker && ( From d6f97cc754a09f82a36da4795fd8280abd7d18b4 Mon Sep 17 00:00:00 2001 From: simonsays1980 Date: Fri, 31 May 2024 10:36:44 +0200 Subject: [PATCH 58/65] [RLlib] Fix bug: Target nets are not synched with main nets in SAC. (#45614) --- rllib/algorithms/sac/sac_rl_module.py | 7 ------- .../algorithms/sac/torch/sac_torch_rl_module.py | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/rllib/algorithms/sac/sac_rl_module.py b/rllib/algorithms/sac/sac_rl_module.py index 23527c957e59..3acd1e30a4c0 100644 --- a/rllib/algorithms/sac/sac_rl_module.py +++ b/rllib/algorithms/sac/sac_rl_module.py @@ -102,13 +102,6 @@ def setup(self): self.qf_twin = catalog.build_qf_head(framework=self.framework) self.qf_target_twin = catalog.build_qf_head(framework=self.framework) - # We do not want to train the target network. 
- self.qf_target_encoder.trainable = False - self.qf_target.trainable = False - if self.twin_q: - self.qf_target_twin_encoder.trainable = False - self.qf_target_twin.trainable = False - # Get the action distribution class. self.action_dist_cls = catalog.get_action_dist_cls(framework=self.framework) diff --git a/rllib/algorithms/sac/torch/sac_torch_rl_module.py b/rllib/algorithms/sac/torch/sac_torch_rl_module.py index a83303b8b948..9b30e5bbaf89 100644 --- a/rllib/algorithms/sac/torch/sac_torch_rl_module.py +++ b/rllib/algorithms/sac/torch/sac_torch_rl_module.py @@ -33,6 +33,22 @@ def setup(self): # parameter names to be removed or renamed when syncing from the state dict # when synching. if not self.inference_only: + # We do not want to train the target networks. Instead, we sync them + # with the actual (trained) ones. + self.qf_target_encoder.requires_grad_(False) + self.qf_target_encoder.load_state_dict(self.qf_encoder.state_dict()) + self.qf_target.requires_grad_(False) + self.qf_target.load_state_dict(self.qf.state_dict()) + + # If necessary, also synchronize the twin networks. + if self.twin_q: + self.qf_target_twin_encoder.requires_grad_(False) + self.qf_target_twin_encoder.load_state_dict( + self.qf_twin_encoder.state_dict() + ) + self.qf_target_twin.requires_grad_(False) + self.qf_target_twin.load_state_dict(self.qf_twin.state_dict()) + # Set the expected and unexpected keys for the inference-only module. self._set_inference_only_state_dict_keys() From f9ab43954e0f6ee5ab40db26e3d7bcf47a92a0a6 Mon Sep 17 00:00:00 2001 From: Jialing He Date: Sat, 1 Jun 2024 00:18:37 +0800 Subject: [PATCH 59/65] [Core] Fix the GIL deadlock issue caused by `list_named_actors`. 
(#45582) Signed-off-by: hejialing.hjl --- python/ray/_raylet.pyx | 5 +++-- python/ray/tests/test_list_actors_4.py | 31 ++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 8437edaa0122..407db534b013 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -4446,8 +4446,9 @@ cdef class CoreWorker: cdef: pair[c_vector[pair[c_string, c_string]], CRayStatus] result_pair - result_pair = CCoreWorkerProcess.GetCoreWorker().ListNamedActors( - all_namespaces) + with nogil: + result_pair = CCoreWorkerProcess.GetCoreWorker().ListNamedActors( + all_namespaces) check_status(result_pair.second) return [ (namespace.decode("utf-8"), diff --git a/python/ray/tests/test_list_actors_4.py b/python/ray/tests/test_list_actors_4.py index 0f302e428084..4bbbc819338d 100644 --- a/python/ray/tests/test_list_actors_4.py +++ b/python/ray/tests/test_list_actors_4.py @@ -1,5 +1,7 @@ +import asyncio import pytest import sys +import time import ray from ray._private.test_utils import run_string_as_driver @@ -52,6 +54,35 @@ class A: assert not ray.util.list_named_actors(all_namespaces=True) +@pytest.mark.asyncio +async def test_list_named_actors_with_normal_task(shutdown_only): + # The following parameters are all designed to increase the + # probability of reproducing the situation where + # `list_named_actors` hangs. + # See https://github.com/ray-project/ray/issues/45581 for more details.
+ TEST_RANGE = 10 + NORMAL_TASK_PER_ITEM = 100 + LIST_NAMED_ACTORS_PER_ITEM = 10 + for _ in range(TEST_RANGE): + time.sleep(1) + + @ray.remote + def test(): + return True + + res = [] + for i in range(NORMAL_TASK_PER_ITEM): + res.append(test.remote()) + + async def run(): + for i in range(LIST_NAMED_ACTORS_PER_ITEM): + await asyncio.sleep(0) + ray.util.list_named_actors(True) + + res.append(run()) + await asyncio.gather(*res) + + if __name__ == "__main__": import os From a95ec7fad786a3dfac995ccb126398f3f046e8df Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Fri, 31 May 2024 18:24:27 +0200 Subject: [PATCH 60/65] [RLlib] DreamerV3 on tf: Fix bug w/ `reduce_fn` still passed into `LearnerGroup.update_from_batch()`. (#45419) --- rllib/algorithms/dqn/dqn.py | 19 +- rllib/algorithms/dreamerv3/dreamerv3.py | 222 +++++++++------ .../algorithms/dreamerv3/dreamerv3_learner.py | 56 +--- .../dreamerv3/dreamerv3_rl_module.py | 86 ++++-- .../dreamerv3/tests/test_dreamerv3.py | 126 ++++----- .../dreamerv3/tf/dreamerv3_tf_learner.py | 33 ++- .../dreamerv3/tf/dreamerv3_tf_rl_module.py | 40 +-- .../dreamerv3/tf/models/actor_network.py | 8 +- .../tf/models/components/cnn_atari.py | 2 +- .../models/components/continue_predictor.py | 4 +- .../tf/models/components/reward_predictor.py | 4 +- .../tf/models/components/sequence_model.py | 5 +- .../tf/models/components/vector_decoder.py | 4 +- .../dreamerv3/tf/models/critic_network.py | 4 +- .../dreamerv3/tf/models/dreamer_model.py | 1 + rllib/algorithms/dreamerv3/utils/__init__.py | 2 +- .../algorithms/dreamerv3/utils/env_runner.py | 266 ++++++++++++------ rllib/algorithms/dreamerv3/utils/summaries.py | 253 ++++++++++------- rllib/algorithms/impala/impala.py | 7 + rllib/algorithms/ppo/ppo.py | 9 +- .../ppo/tests/test_ppo_with_env_runner.py | 8 +- .../ppo/tests/test_ppo_with_rl_module.py | 8 +- rllib/core/learner/learner.py | 121 ++++---- rllib/core/learner/learner_group.py | 38 ++- rllib/core/learner/torch/torch_learner.py | 2 +- 
rllib/core/models/torch/heads.py | 2 +- rllib/core/models/torch/primitives.py | 25 +- rllib/core/models/torch/utils.py | 23 +- rllib/core/rl_module/torch/torch_rl_module.py | 3 +- .../examples/catalogs/mobilenet_v2_encoder.py | 5 +- .../examples/evaluation/custom_evaluation.py | 1 + rllib/execution/rollout_ops.py | 9 +- rllib/tuned_examples/dreamerv3/atari_100k.py | 50 ++-- rllib/tuned_examples/dreamerv3/atari_200M.py | 53 ++-- .../dreamerv3/dm_control_suite_vision.py | 41 ++- rllib/utils/metrics/__init__.py | 1 + rllib/utils/metrics/metrics_logger.py | 30 +- rllib/utils/metrics/stats.py | 40 ++- 38 files changed, 951 insertions(+), 660 deletions(-) diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index 2674f99dbbd5..3bf51e0edd42 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -629,6 +629,11 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: env_runner_results, key=ENV_RUNNER_RESULTS ) + self.metrics.log_dict( + self.metrics.peek(ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED, default={}), + key=NUM_AGENT_STEPS_SAMPLED_LIFETIME, + reduce="sum", + ) self.metrics.log_value( NUM_ENV_STEPS_SAMPLED_LIFETIME, self.metrics.peek(ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED, default=0), @@ -639,11 +644,6 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: self.metrics.peek(ENV_RUNNER_RESULTS, NUM_EPISODES, default=0), reduce="sum", ) - self.metrics.log_dict( - self.metrics.peek(ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED, default={}), - key=NUM_AGENT_STEPS_SAMPLED_LIFETIME, - reduce="sum", - ) self.metrics.log_dict( self.metrics.peek(ENV_RUNNER_RESULTS, NUM_MODULE_STEPS_SAMPLED, default={}), key=NUM_MODULE_STEPS_SAMPLED_LIFETIME, @@ -680,6 +680,14 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)): learner_results = self.learner_group.update_from_episodes( episodes=episodes, + timesteps={ + 
NUM_ENV_STEPS_SAMPLED_LIFETIME: ( + self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) + ), + NUM_AGENT_STEPS_SAMPLED_LIFETIME: ( + self.metrics.peek(NUM_AGENT_STEPS_SAMPLED_LIFETIME) + ), + }, ) # Isolate TD-errors from result dicts (we should not log these to # disk or WandB, they might be very large). @@ -730,6 +738,7 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict: # Update the target networks, if necessary. with self.metrics.log_time((TIMERS, LEARNER_ADDITIONAL_UPDATE_TIMER)): modules_to_update = set(learner_results[0].keys()) - {ALL_MODULES} + # TODO (sven): Move to Learner._after_gradient_based_update(). additional_results = self.learner_group.additional_update( module_ids_to_update=modules_to_update, timestep=current_ts, diff --git a/rllib/algorithms/dreamerv3/dreamerv3.py b/rllib/algorithms/dreamerv3/dreamerv3.py index abb163f8b671..50bcce11ad90 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3.py +++ b/rllib/algorithms/dreamerv3/dreamerv3.py @@ -10,11 +10,9 @@ import gc import logging -import tree # pip install dm_tree -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, Optional, Union import gymnasium as gym -import numpy as np from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided @@ -22,30 +20,36 @@ from ray.rllib.algorithms.dreamerv3.utils import do_symlog_obs from ray.rllib.algorithms.dreamerv3.utils.env_runner import DreamerV3EnvRunner from ray.rllib.algorithms.dreamerv3.utils.summaries import ( + report_dreamed_eval_trajectory_vs_samples, report_predicted_vs_sampled_obs, report_sampling_and_replay_buffer, ) from ray.rllib.core import DEFAULT_MODULE_ID from ray.rllib.core.columns import Columns from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.execution.rollout_ops import synchronous_parallel_sample from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils 
import deep_update from ray.rllib.utils.annotations import override, PublicAPI from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.numpy import one_hot from ray.rllib.utils.metrics import ( - ALL_MODULES, + ENV_RUNNER_RESULTS, GARBAGE_COLLECTION_TIMER, LEARN_ON_BATCH_TIMER, + LEARNER_RESULTS, NUM_AGENT_STEPS_SAMPLED, - NUM_AGENT_STEPS_TRAINED, + NUM_AGENT_STEPS_SAMPLED_LIFETIME, NUM_ENV_STEPS_SAMPLED, - NUM_ENV_STEPS_TRAINED, + NUM_ENV_STEPS_SAMPLED_LIFETIME, + NUM_ENV_STEPS_TRAINED_LIFETIME, + NUM_EPISODES, + NUM_EPISODES_LIFETIME, NUM_GRAD_UPDATES_LIFETIME, NUM_SYNCH_WORKER_WEIGHTS, - NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, SAMPLE_TIMER, SYNCH_WORKER_WEIGHTS_TIMER, + TIMERS, ) from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer from ray.rllib.utils.typing import LearningRateOrSchedule, ResultDict @@ -145,10 +149,6 @@ def __init__(self, algo_class=None): self.env_runner_cls = DreamerV3EnvRunner self.num_env_runners = 0 self.rollout_fragment_length = 1 - # Since we are using a gymnasium-based EnvRunner, we can utilitze its - # vectorization capabilities w/o suffering performance losses (as we would - # with RLlib's `RemoteVectorEnv`). - self.remote_worker_envs = True # Dreamer only runs on the new API stack. self.enable_rl_module_and_learner = True self.enable_env_runner_and_connector_v2 = True @@ -506,8 +506,6 @@ def setup(self, config: AlgorithmConfig): @override(Algorithm) def training_step(self) -> ResultDict: - results = {} - env_runner = self.workers.local_worker() # Push enough samples into buffer initially before we start training. @@ -520,7 +518,7 @@ def training_step(self) -> ResultDict: # Have we sampled yet in this `training_step()` call? 
have_sampled = False - with self._timers[SAMPLE_TIMER]: + with self.metrics.log_time((TIMERS, SAMPLE_TIMER)): # Continue sampling from the actual environment (and add collected samples # to our replay buffer) as long as we: while ( @@ -535,45 +533,76 @@ def training_step(self) -> ResultDict: or not have_sampled ): # Sample using the env runner's module. - done_episodes, ongoing_episodes = env_runner.sample() + episodes, env_runner_results = synchronous_parallel_sample( + worker_set=self.workers, + max_agent_steps=( + self.config.rollout_fragment_length + * self.config.num_envs_per_env_runner + ), + sample_timeout_s=self.config.sample_timeout_s, + _uses_new_env_runners=True, + _return_metrics=True, + ) + self.metrics.merge_and_log_n_dicts( + env_runner_results, key=ENV_RUNNER_RESULTS + ) # Add ongoing and finished episodes into buffer. The buffer will # automatically take care of properly concatenating (by episode IDs) # the different chunks of the same episodes, even if they come in via # separate `add()` calls. - self.replay_buffer.add(episodes=done_episodes + ongoing_episodes) + self.replay_buffer.add(episodes=episodes) have_sampled = True # We took B x T env steps. - env_steps_last_regular_sample = sum( - len(eps) for eps in done_episodes + ongoing_episodes - ) + env_steps_last_regular_sample = sum(len(eps) for eps in episodes) total_sampled = env_steps_last_regular_sample # If we have never sampled before (just started the algo and not # recovered from a checkpoint), sample B random actions first. 
- if self._counters[NUM_AGENT_STEPS_SAMPLED] == 0: - d_, o_ = env_runner.sample( - num_timesteps=( + if self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0) == 0: + _episodes, _env_runner_results = synchronous_parallel_sample( + worker_set=self.workers, + max_agent_steps=( self.config.batch_size_B * self.config.batch_length_T - ) - - env_steps_last_regular_sample, + - env_steps_last_regular_sample + ), + sample_timeout_s=self.config.sample_timeout_s, random_actions=True, + _uses_new_env_runners=True, + _return_metrics=True, ) - self.replay_buffer.add(episodes=d_ + o_) - total_sampled += sum(len(eps) for eps in d_ + o_) - - self._counters[NUM_AGENT_STEPS_SAMPLED] += total_sampled - self._counters[NUM_ENV_STEPS_SAMPLED] += total_sampled + self.metrics.merge_and_log_n_dicts( + _env_runner_results, key=ENV_RUNNER_RESULTS + ) + self.replay_buffer.add(episodes=_episodes) + total_sampled += sum(len(eps) for eps in _episodes) + + # Update lifetime counts (now that we gathered results from all + # EnvRunners). + self.metrics.log_dict( + { + NUM_AGENT_STEPS_SAMPLED_LIFETIME: self.metrics.peek( + ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED + ), + NUM_ENV_STEPS_SAMPLED_LIFETIME: self.metrics.peek( + ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED + ), + NUM_EPISODES_LIFETIME: self.metrics.peek( + ENV_RUNNER_RESULTS, NUM_EPISODES + ), + }, + reduce="sum", + ) # Summarize environment interaction and buffer data. - results[ALL_MODULES] = report_sampling_and_replay_buffer( - replay_buffer=self.replay_buffer, + report_sampling_and_replay_buffer( + metrics=self.metrics, replay_buffer=self.replay_buffer ) # Continue sampling batch_size_B x batch_length_T sized batches from the buffer - # and using these to update our models (`LearnerGroup.update()`) until the - # computed `training_ratio` is larger than the configured one, meaning we should - # go back and collect more samples again from the actual environment. 
+ # and using these to update our models (`LearnerGroup.update_from_batch()`) + # until the computed `training_ratio` is larger than the configured one, meaning + # we should go back and collect more samples again from the actual environment. # However, when calculating the `training_ratio` here, we use only the # trained steps in this very `training_step()` call over the most recent sample # amount (`env_steps_last_regular_sample`), not the global values. This is to @@ -584,7 +613,7 @@ def training_step(self) -> ResultDict: replayed_steps_this_iter / env_steps_last_regular_sample ) < self.config.training_ratio: # Time individual batch updates. - with self._timers[LEARN_ON_BATCH_TIMER]: + with self.metrics.log_time((TIMERS, LEARN_ON_BATCH_TIMER)): logger.info(f"\tSub-iteration {self.training_iteration}/{sub_iter})") # Draw a new sample from the replay buffer. @@ -603,65 +632,76 @@ def training_step(self) -> ResultDict: ) # Perform the actual update via our learner group. - train_results = self.learner_group.update_from_batch( + learner_results = self.learner_group.update_from_batch( batch=SampleBatch(sample).as_multi_agent(), - reduce_fn=self._reduce_results, + # TODO(sven): Maybe we should do this broadcast of global timesteps + # at the end, like for EnvRunner global env step counts. Maybe when + # we request the state from the Learners, we can - at the same + # time - send the current globally summed/reduced-timesteps. + timesteps={ + NUM_ENV_STEPS_SAMPLED_LIFETIME: self.metrics.peek( + NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0 + ) + }, ) - self._counters[NUM_AGENT_STEPS_TRAINED] += replayed_steps - self._counters[NUM_ENV_STEPS_TRAINED] += replayed_steps - - # Perform additional (non-gradient updates), such as the critic EMA-copy - # update.
- with self._timers["critic_ema_update"]: - self.learner_group.additional_update( - timestep=self._counters[NUM_ENV_STEPS_SAMPLED], - reduce_fn=self._reduce_results, - ) - - if self.config.report_images_and_videos: - report_predicted_vs_sampled_obs( - # TODO (sven): DreamerV3 is single-agent only. - results=train_results[DEFAULT_MODULE_ID], - sample=sample, - batch_size_B=self.config.batch_size_B, - batch_length_T=self.config.batch_length_T, - symlog_obs=do_symlog_obs( - env_runner.env.single_observation_space, - self.config.symlog_obs, - ), - ) - - res = train_results[DEFAULT_MODULE_ID] - logger.info( - f"\t\tWORLD_MODEL_L_total={res['WORLD_MODEL_L_total']:.5f} (" - f"L_pred={res['WORLD_MODEL_L_prediction']:.5f} (" - f"decoder/obs={res['WORLD_MODEL_L_decoder']} " - f"L_rew={res['WORLD_MODEL_L_reward']} " - f"L_cont={res['WORLD_MODEL_L_continue']}); " - f"L_dyn/rep={res['WORLD_MODEL_L_dynamics']:.5f})" + self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS) + self.metrics.log_value( + NUM_ENV_STEPS_TRAINED_LIFETIME, replayed_steps, reduce="sum" ) - msg = "\t\t" - if self.config.train_actor: - msg += f"L_actor={res['ACTOR_L_total']:.5f} " - if self.config.train_critic: - msg += f"L_critic={res['CRITIC_L_total']:.5f} " - logger.info(msg) sub_iter += 1 - self._counters[NUM_GRAD_UPDATES_LIFETIME] += 1 + self.metrics.log_value(NUM_GRAD_UPDATES_LIFETIME, 1, reduce="sum") + + # Log videos showing how the decoder produces observation predictions + # from the posterior states. + # Only every n iterations and only for the first sampled batch row + # (videos are `config.batch_length_T` frames long). + report_predicted_vs_sampled_obs( + # TODO (sven): DreamerV3 is single-agent only. 
+ metrics=self.metrics, + sample=sample, + batch_size_B=self.config.batch_size_B, + batch_length_T=self.config.batch_length_T, + symlog_obs=do_symlog_obs( + env_runner.env.single_observation_space, + self.config.symlog_obs, + ), + do_report=( + self.config.report_images_and_videos + and self.training_iteration % 100 == 0 + ), + ) + + # Log videos showing some of the dreamed trajectories and compare them with the + # actual trajectories from the train batch. + # Only every n iterations and only for the first sampled batch row AND first ts. + # (videos are `config.horizon_H` frames long originating from the observation + # at B=0 and T=0 in the train batch). + report_dreamed_eval_trajectory_vs_samples( + metrics=self.metrics, + sample=sample, + burn_in_T=0, + dreamed_T=self.config.horizon_H + 1, + dreamer_model=self.workers.local_worker().module.dreamer_model, + symlog_obs=do_symlog_obs( + env_runner.env.single_observation_space, + self.config.symlog_obs, + ), + do_report=( + self.config.report_dream_data and self.training_iteration % 100 == 0 + ), + ) # Update weights - after learning on the LearnerGroup - on all EnvRunner # workers. - with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: + with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)): # Only necessary if RLModule is not shared between (local) EnvRunner and # (local) Learner. 
if not self.config.share_module_between_env_runner_and_learner: - self._counters[ - NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS - ] = 0 - self._counters[NUM_SYNCH_WORKER_WEIGHTS] += 1 + self.metrics.log_value(NUM_SYNCH_WORKER_WEIGHTS, 1, reduce="sum") self.workers.sync_weights( - from_worker_or_learner_group=self.learner_group + from_worker_or_learner_group=self.learner_group, + inference_only=True, ) # Try trick from https://medium.com/dive-into-ml-ai/dealing-with-memory-leak- @@ -669,33 +709,29 @@ def training_step(self) -> ResultDict: if self.config.gc_frequency_train_steps and ( self.training_iteration % self.config.gc_frequency_train_steps == 0 ): - with self._timers[GARBAGE_COLLECTION_TIMER]: + with self.metrics.log_time((TIMERS, GARBAGE_COLLECTION_TIMER)): gc.collect() # Add train results and the actual training ratio to stats. The latter should # be close to the configured `training_ratio`. - results.update(train_results) - results[ALL_MODULES]["actual_training_ratio"] = self.training_ratio + self.metrics.log_value("actual_training_ratio", self.training_ratio, window=1) # Return all results. - return results + return self.metrics.reduce() @property def training_ratio(self) -> float: - """Returns the actual training ratio of this Algorithm. + """Returns the actual training ratio of this Algorithm (not the configured one). The training ratio is copmuted by dividing the total number of steps trained thus far (replayed from the buffer) over the total number of actual env steps taken thus far. 
""" - return self._counters[NUM_ENV_STEPS_TRAINED] / ( - self._counters[NUM_ENV_STEPS_SAMPLED] + eps = 0.0001 + return self.metrics.peek(NUM_ENV_STEPS_TRAINED_LIFETIME, default=0) / ( + (self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=eps) or eps) ) - @staticmethod - def _reduce_results(results: List[Dict[str, Any]]): - return tree.map_structure(lambda *s: np.mean(s, axis=0), *results) - # TODO (sven): Remove this once DreamerV3 is on the new SingleAgentEnvRunner. @PublicAPI def __setstate__(self, state) -> None: diff --git a/rllib/algorithms/dreamerv3/dreamerv3_learner.py b/rllib/algorithms/dreamerv3/dreamerv3_learner.py index 2ee6f4b16187..684829e3f194 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3_learner.py +++ b/rllib/algorithms/dreamerv3/dreamerv3_learner.py @@ -7,61 +7,25 @@ D. Hafner, T. Lillicrap, M. Norouzi, J. Ba https://arxiv.org/pdf/2010.02193.pdf """ -from typing import Any, DefaultDict, Dict - -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config from ray.rllib.core.learner.learner import Learner -from ray.rllib.policy.sample_batch import MultiAgentBatch -from ray.rllib.utils.annotations import override -from ray.rllib.utils.typing import ModuleID, TensorType +from ray.rllib.utils.annotations import ( + override, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) class DreamerV3Learner(Learner): """DreamerV3 specific Learner class. - Only implements the `additional_update_for_module()` method to define the logic + Only implements the `_after_gradient_based_update()` method to define the logic for updating the critic EMA-copy after each training step. 
""" + @OverrideToImplementCustomLogic_CallToSuperRecommended @override(Learner) - def compile_results( - self, - *, - batch: MultiAgentBatch, - fwd_out: Dict[str, Any], - loss_per_module: Dict[str, TensorType], - metrics_per_module: DefaultDict[ModuleID, Dict[str, Any]], - ) -> Dict[str, Any]: - results = super().compile_results( - batch=batch, - fwd_out=fwd_out, - loss_per_module=loss_per_module, - metrics_per_module=metrics_per_module, - ) - - # Add the predicted obs distributions for possible (video) summarization. - if self.config.report_images_and_videos: - for module_id, res in results.items(): - if module_id in fwd_out: - res["WORLD_MODEL_fwd_out_obs_distribution_means_BxT"] = fwd_out[ - module_id - ]["obs_distribution_means_BxT"] - return results - - @override(Learner) - def additional_update_for_module( - self, - *, - module_id: ModuleID, - config: DreamerV3Config, - timestep: int, - ) -> None: - """Updates the EMA weights of the critic network.""" - - # Call the base class' method. - super().additional_update_for_module( - module_id=module_id, config=config, timestep=timestep - ) + def _after_gradient_based_update(self, timesteps): + super()._after_gradient_based_update(timesteps) # Update EMA weights of the critic. - self.module[module_id].critic.update_ema() + for module_id, module in self.module._rl_modules.items(): + module.critic.update_ema() diff --git a/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py b/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py index 9d5bf2605529..c95363eaa907 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py +++ b/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py @@ -3,6 +3,7 @@ """ import abc +from typing import Any, Dict import gymnasium as gym import numpy as np @@ -29,6 +30,8 @@ class DreamerV3RLModule(RLModule, abc.ABC): @override(RLModule) def setup(self): + super().setup() + # Gather model-relevant settings. 
B = 1 T = self.config.model_config_dict["batch_length_T"] @@ -79,35 +82,39 @@ def setup(self): self.action_dist_cls = catalog.get_action_dist_cls(framework=self.framework) # Perform a test `call()` to force building the dreamer model's variables. - test_obs = np.tile( - np.expand_dims(self.config.observation_space.sample(), (0, 1)), - reps=(B, T) + (1,) * len(self.config.observation_space.shape), - ) - if isinstance(self.config.action_space, gym.spaces.Discrete): - test_actions = np.tile( - np.expand_dims( - one_hot( - self.config.action_space.sample(), - depth=self.config.action_space.n, + if self.framework == "tf2": + test_obs = np.tile( + np.expand_dims(self.config.observation_space.sample(), (0, 1)), + reps=(B, T) + (1,) * len(self.config.observation_space.shape), + ) + if isinstance(self.config.action_space, gym.spaces.Discrete): + test_actions = np.tile( + np.expand_dims( + one_hot( + self.config.action_space.sample(), + depth=self.config.action_space.n, + ), + (0, 1), ), - (0, 1), + reps=(B, T, 1), + ) + else: + test_actions = np.tile( + np.expand_dims(self.config.action_space.sample(), (0, 1)), + reps=(B, T, 1), + ) + + self.dreamer_model( + inputs=None, + observations=_convert_to_tf(test_obs, dtype=tf.float32), + actions=_convert_to_tf(test_actions, dtype=tf.float32), + is_first=_convert_to_tf(np.ones((B, T)), dtype=tf.bool), + start_is_terminated_BxT=_convert_to_tf( + np.zeros((B * T,)), dtype=tf.bool ), - reps=(B, T, 1), - ) - else: - test_actions = np.tile( - np.expand_dims(self.config.action_space.sample(), (0, 1)), - reps=(B, T, 1), + gamma=gamma, ) - self.dreamer_model( - None, - _convert_to_tf(test_obs, dtype=tf.float32), - _convert_to_tf(test_actions, dtype=tf.float32), - _convert_to_tf(np.ones((B, T)), dtype=tf.bool), - _convert_to_tf(np.zeros((B * T,)), dtype=tf.bool), - ) - # Initialize the critic EMA net: self.critic.init_ema() @@ -152,3 +159,32 @@ def output_specs_train(self) -> SpecDict: # Deterministic, continuous h-states (t1 to T). 
"h_states_BxT", ] + + @override(RLModule) + def _forward_inference(self, batch: NestedDict) -> Dict[str, Any]: + # Call the Dreamer-Model's forward_inference method and return a dict. + actions, next_state = self.dreamer_model.forward_inference( + observations=batch[Columns.OBS], + previous_states=batch[Columns.STATE_IN], + is_first=batch["is_first"], + ) + return {Columns.ACTIONS: actions, Columns.STATE_OUT: next_state} + + @override(RLModule) + def _forward_exploration(self, batch: NestedDict) -> Dict[str, Any]: + # Call the Dreamer-Model's forward_exploration method and return a dict. + actions, next_state = self.dreamer_model.forward_exploration( + observations=batch[Columns.OBS], + previous_states=batch[Columns.STATE_IN], + is_first=batch["is_first"], + ) + return {Columns.ACTIONS: actions, Columns.STATE_OUT: next_state} + + @override(RLModule) + def _forward_train(self, batch: NestedDict): + # Call the Dreamer-Model's forward_train method and return its outputs as-is. + return self.dreamer_model.forward_train( + observations=batch[Columns.OBS], + actions=batch[Columns.ACTIONS], + is_first=batch["is_first"], + ) diff --git a/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py b/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py index 92bb33dda483..0c8875b54f10 100644 --- a/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py +++ b/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py @@ -40,6 +40,7 @@ def test_dreamerv3_compilation(self): # Build a DreamerV3Config object. config = ( dreamerv3.DreamerV3Config() + .framework(eager_tracing=False) .training( # Keep things simple. Especially the long dream rollouts seem # to take an enormous amount of time (initially). @@ -51,7 +52,7 @@ def test_dreamerv3_compilation(self): use_float16=False, ) .learners( - num_learners=2, # Try with 2 Learners. + num_learners=0, # TODO 2 # Try with 2 Learners. 
num_cpus_per_learner=1, num_gpus_per_learner=0, ) @@ -59,70 +60,69 @@ def test_dreamerv3_compilation(self): num_iterations = 2 - for _ in framework_iterator(config, frameworks="tf2"): - for env in [ - "FrozenLake-v1", - "CartPole-v1", - "ALE/MsPacman-v5", - "Pendulum-v1", - ]: - print("Env={}".format(env)) - # Add one-hot observations for FrozenLake env. - if env == "FrozenLake-v1": - - def env_creator(ctx): - import gymnasium as gym - from ray.rllib.algorithms.dreamerv3.utils.env_runner import ( - OneHot, - ) - - return OneHot(gym.make("FrozenLake-v1")) - - tune.register_env("frozen-lake-one-hot", env_creator) - env = "frozen-lake-one-hot" - - config.environment(env) - algo = config.build() - obs_space = algo.workers.local_worker().env.single_observation_space - act_space = algo.workers.local_worker().env.single_action_space - rl_module = algo.workers.local_worker().module - - for i in range(num_iterations): - results = algo.train() - print(results) - # Test dream trajectory w/ recreated observations. - sample = algo.replay_buffer.sample() - dream = rl_module.dreamer_model.dream_trajectory_with_burn_in( - start_states=rl_module.dreamer_model.get_initial_state(), - timesteps_burn_in=5, - timesteps_H=45, - observations=sample["obs"][:1], # B=1 - actions=( - one_hot( - sample["actions"], - depth=act_space.n, - ) - if isinstance(act_space, gym.spaces.Discrete) - else sample["actions"] - )[ - :1 - ], # B=1 - ) - self.assertTrue( - dream["actions_dreamed_t0_to_H_BxT"].shape - == (46, 1) - + ( - (act_space.n,) - if isinstance(act_space, gym.spaces.Discrete) - else tuple(act_space.shape) + for env in [ + "FrozenLake-v1", + "CartPole-v1", + "ALE/MsPacman-v5", + "Pendulum-v1", + ]: + print("Env={}".format(env)) + # Add one-hot observations for FrozenLake env. 
+ if env == "FrozenLake-v1": + + def env_creator(ctx): + import gymnasium as gym + from ray.rllib.algorithms.dreamerv3.utils.env_runner import ( + OneHot, ) + + return OneHot(gym.make("FrozenLake-v1")) + + tune.register_env("frozen-lake-one-hot", env_creator) + env = "frozen-lake-one-hot" + + config.environment(env) + algo = config.build() + obs_space = algo.workers.local_worker().env.single_observation_space + act_space = algo.workers.local_worker().env.single_action_space + rl_module = algo.workers.local_worker().module + + for i in range(num_iterations): + results = algo.train() + print(results) + # Test dream trajectory w/ recreated observations. + sample = algo.replay_buffer.sample() + dream = rl_module.dreamer_model.dream_trajectory_with_burn_in( + start_states=rl_module.dreamer_model.get_initial_state(), + timesteps_burn_in=5, + timesteps_H=45, + observations=sample["obs"][:1], # B=1 + actions=( + one_hot( + sample["actions"], + depth=act_space.n, + ) + if isinstance(act_space, gym.spaces.Discrete) + else sample["actions"] + )[ + :1 + ], # B=1 + ) + self.assertTrue( + dream["actions_dreamed_t0_to_H_BxT"].shape + == (46, 1) + + ( + (act_space.n,) + if isinstance(act_space, gym.spaces.Discrete) + else tuple(act_space.shape) ) - self.assertTrue(dream["continues_dreamed_t0_to_H_BxT"].shape == (46, 1)) - self.assertTrue( - dream["observations_dreamed_t0_to_H_BxT"].shape - == [46, 1] + list(obs_space.shape) - ) - algo.stop() + ) + self.assertTrue(dream["continues_dreamed_t0_to_H_BxT"].shape == (46, 1)) + self.assertTrue( + dream["observations_dreamed_t0_to_H_BxT"].shape + == [46, 1] + list(obs_space.shape) + ) + algo.stop() def test_dreamerv3_dreamer_model_sizes(self): """Tests, whether the different model sizes match the ones reported in [1].""" diff --git a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py index 2fa6dce2f02d..d35717e4aa44 100644 --- 
a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py +++ b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py @@ -18,7 +18,6 @@ from ray.rllib.core.learner.learner import ParamDict from ray.rllib.core.learner.tf.tf_learner import TfLearner from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import deprecation_warning from ray.rllib.utils.framework import try_import_tf, try_import_tfp from ray.rllib.utils.tf_utils import symlog, two_hot, clip_gradients from ray.rllib.utils.typing import ModuleID, TensorType @@ -40,7 +39,7 @@ class DreamerV3TfLearner(DreamerV3Learner, TfLearner): @override(TfLearner) def configure_optimizers_for_module( - self, module_id: ModuleID, config: DreamerV3Config = None, hps=None + self, module_id: ModuleID, config: DreamerV3Config = None ): """Create the 3 optimizers for Dreamer learning: world_model, actor, critic. @@ -48,12 +47,6 @@ def configure_optimizers_for_module( - albeit probably not that important - are used by the author's own implementation. """ - if hps is not None: - deprecation_warning( - old="Learner.configure_optimizers_for_module(.., hps=..)", - help="Deprecated argument. Use `config` (AlgorithmConfig) instead.", - error=True, - ) dreamerv3_module = self._module[module_id] @@ -242,10 +235,20 @@ def compute_loss_for_module( key=module_id, window=1, # <- single items (should not be mean/ema-reduced over time). ) + + # Add the predicted obs distributions for possible (video) summarization. + if config.report_images_and_videos: + self.metrics.log_value( + (module_id, "WORLD_MODEL_fwd_out_obs_distribution_means_b0xT"), + fwd_out["obs_distribution_means_BxT"][: self.config.batch_length_T], + reduce=None, # No reduction, we want the tensor to stay in-tact. + window=1, # <- single items (should not be mean/ema-reduced over time). + ) + if config.report_individual_batch_item_stats: # Log important world-model loss stats. 
self.metrics.log_dict( - metrics_dict={ + { "WORLD_MODEL_L_decoder_B_T": prediction_losses["L_decoder_B_T"], "WORLD_MODEL_L_reward_B_T": prediction_losses["L_reward_B_T"], "WORLD_MODEL_L_continue_B_T": prediction_losses["L_continue_B_T"], @@ -270,10 +273,10 @@ def compute_loss_for_module( "h": fwd_out["h_states_BxT"], "z": fwd_out["z_posterior_states_BxT"], }, - start_is_terminated=tf.reshape(batch["is_terminated"], [-1]), # ->BxT + start_is_terminated=tf.reshape(batch["is_terminated"], [-1]), # -> BxT ) if config.report_dream_data: - # To reduce this massive mount of data a little, slice out a T=1 piece + # To reduce this massive amount of data a little, slice out a T=1 piece # from each stats that has the shape (H, BxT), meaning convert e.g. # `rewards_dreamed_t0_to_H_BxT` into `rewards_dreamed_t0_to_H_Bx1`. # This will reduce the amount of data to be transferred and reported @@ -281,9 +284,9 @@ def compute_loss_for_module( self.metrics.log_dict( { # Replace 'T' with '1'. - "DREAM_DATA_" - + key[:-1] - + "1": (value[:, config.batch_size_B_per_learner]) + f"DREAM_DATA_{key[:-1]}1": ( + value[:, config.batch_size_B_per_learner] + ) for key, value in dream_data.items() if key.endswith("H_BxT") }, @@ -733,7 +736,7 @@ def _compute_critic_loss( :-1 ] - # Reduce over H- (time) axis (sum) and then B-axis (mean). + # Reduce over both H- (time) axis and B-axis (mean). L_critic = tf.reduce_mean(L_critic_H_B) # Log important critic loss stats. diff --git a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py index a05c516d29bd..44952e829741 100644 --- a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py +++ b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py @@ -7,17 +7,8 @@ D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba https://arxiv.org/pdf/2010.02193.pdf """ -from typing import Any, Dict - from ray.rllib.algorithms.dreamerv3.dreamerv3_rl_module import DreamerV3RLModule -from ray.rllib.core.columns import Columns -from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.core.rl_module.tf.tf_rl_module import TfRLModule -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.nested_dict import NestedDict - -tf1, tf, _ = try_import_tf() class DreamerV3TfRLModule(TfRLModule, DreamerV3RLModule): @@ -26,33 +17,4 @@ class DreamerV3TfRLModule(TfRLModule, DreamerV3RLModule): Serves mainly as a thin-wrapper around the `DreamerModel` (a tf.keras.Model) class. """ - framework: str = "tf2" - - @override(RLModule) - def _forward_inference(self, batch: NestedDict) -> Dict[str, Any]: - # Call the Dreamer-Model's forward_inference method and return a dict. - actions, next_state = self.dreamer_model.forward_inference( - observations=batch[Columns.OBS], - previous_states=batch[Columns.STATE_IN], - is_first=batch["is_first"], - ) - return {Columns.ACTIONS: actions, Columns.STATE_OUT: next_state} - - @override(RLModule) - def _forward_exploration(self, batch: NestedDict) -> Dict[str, Any]: - # Call the Dreamer-Model's forward_exploration method and return a dict. - actions, next_state = self.dreamer_model.forward_exploration( - observations=batch[Columns.OBS], - previous_states=batch[Columns.STATE_IN], - is_first=batch["is_first"], - ) - return {Columns.ACTIONS: actions, Columns.STATE_OUT: next_state} - - @override(RLModule) - def _forward_train(self, batch: NestedDict): - # Call the Dreamer-Model's forward_train method and return its outputs as-is. 
- return self.dreamer_model.forward_train( - observations=batch[Columns.OBS], - actions=batch[Columns.ACTIONS], - is_first=batch["is_first"], - ) + framework = "tf2" diff --git a/rllib/algorithms/dreamerv3/tf/models/actor_network.py b/rllib/algorithms/dreamerv3/tf/models/actor_network.py index 44785323711f..6fe1a7ef5b71 100644 --- a/rllib/algorithms/dreamerv3/tf/models/actor_network.py +++ b/rllib/algorithms/dreamerv3/tf/models/actor_network.py @@ -3,8 +3,6 @@ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap https://arxiv.org/pdf/2301.04104v1.pdf """ -from typing import Optional - import gymnasium as gym from gymnasium.spaces import Box, Discrete import numpy as np @@ -35,15 +33,15 @@ class ActorNetwork(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_size: str = "XS", action_space: gym.Space, ): """Initializes an ActorNetwork instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendix B. Use None for manually setting the different network sizes. - action_space: The action space the our environment used. + action_space: The action space the our environment used. """ super().__init__(name="actor") diff --git a/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py b/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py index 16e733ce4b17..c0f7ee09b092 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py @@ -23,7 +23,7 @@ def __init__( """Initializes a CNNAtari instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendix B. Use None for manually setting the `cnn_multiplier`. cnn_multiplier: Optional override for the additional factor used to multiply the number of filters with each CNN layer. 
Starting with diff --git a/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py index dd948b9951f0..d5434d8aca31 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py @@ -3,8 +3,6 @@ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap https://arxiv.org/pdf/2301.04104v1.pdf """ -from typing import Optional - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.utils import ( get_gru_units, @@ -29,7 +27,7 @@ class ContinuePredictor(tf.keras.Model): terminal. """ - def __init__(self, *, model_size: Optional[str] = "XS"): + def __init__(self, *, model_size: str = "XS"): """Initializes a ContinuePredictor instance. Args: diff --git a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py index c281565897cb..3e7cb6de93f9 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py @@ -3,8 +3,6 @@ D. Hafner, J. Pasukonis, J. Ba, T. 
Lillicrap https://arxiv.org/pdf/2301.04104v1.pdf """ -from typing import Optional - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import ( RewardPredictorLayer, @@ -28,7 +26,7 @@ class RewardPredictor(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_size: str = "XS", num_buckets: int = 255, lower_bound: float = -20.0, upper_bound: float = 20.0, diff --git a/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py b/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py index 0d2de1970471..7e21c9860578 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py @@ -76,7 +76,6 @@ def __init__( num_gru_units, return_sequences=False, return_state=False, - time_major=True, # Note: Changing these activations is most likely a bad idea! # In experiments, setting one of both of them to silu deteriorated # performance significantly. @@ -139,7 +138,7 @@ def call(self, a, h, z): ) # Pass through pre-GRU layer. out = self.pre_gru_layer(out) - # Pass through (time-major) GRU. - h_next = self.gru_unit(tf.expand_dims(out, axis=0), initial_state=h) + # Pass through (batch-major) GRU (expand axis=1 as the time axis). + h_next = self.gru_unit(tf.expand_dims(out, axis=1), initial_state=h) # Return the GRU's output (the next h-state). return h_next diff --git a/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py b/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py index a384d1473bda..e183561f9217 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py @@ -3,8 +3,6 @@ D. Hafner, J. Pasukonis, J. Ba, T. 
Lillicrap https://arxiv.org/pdf/2301.04104v1.pdf """ -from typing import Optional - import gymnasium as gym from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP @@ -28,7 +26,7 @@ class VectorDecoder(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_size: str = "XS", observation_space: gym.Space, ): """Initializes a VectorDecoder instance. diff --git a/rllib/algorithms/dreamerv3/tf/models/critic_network.py b/rllib/algorithms/dreamerv3/tf/models/critic_network.py index e2b2d45d9435..4eb9b9940133 100644 --- a/rllib/algorithms/dreamerv3/tf/models/critic_network.py +++ b/rllib/algorithms/dreamerv3/tf/models/critic_network.py @@ -3,8 +3,6 @@ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap https://arxiv.org/pdf/2301.04104v1.pdf """ -from typing import Optional - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import ( RewardPredictorLayer, @@ -33,7 +31,7 @@ class CriticNetwork(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_size: str = "XS", num_buckets: int = 255, lower_bound: float = -20.0, upper_bound: float = 20.0, diff --git a/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py b/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py index 26a44d1fb3b0..dc4eec8579ae 100644 --- a/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py @@ -113,6 +113,7 @@ def call( actions, is_first, start_is_terminated_BxT, + gamma, ): """Main call method for building this model in order to generate its variables. 
diff --git a/rllib/algorithms/dreamerv3/utils/__init__.py b/rllib/algorithms/dreamerv3/utils/__init__.py index 592bbf9b32e8..fe7b58cf515e 100644 --- a/rllib/algorithms/dreamerv3/utils/__init__.py +++ b/rllib/algorithms/dreamerv3/utils/__init__.py @@ -124,7 +124,7 @@ def get_num_curiosity_nets(model_size, override=None): num_curiosity_nets = { "nano": 8, "micro": 8, - "mini": 16, + "mini": 8, "XXS": 8, "XS": 8, "S": 8, diff --git a/rllib/algorithms/dreamerv3/utils/env_runner.py b/rllib/algorithms/dreamerv3/utils/env_runner.py index c0b73ef824fd..93ed2beb6240 100644 --- a/rllib/algorithms/dreamerv3/utils/env_runner.py +++ b/rllib/algorithms/dreamerv3/utils/env_runner.py @@ -15,21 +15,42 @@ import numpy as np import tree # pip install dm_tree +import ray from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.core import DEFAULT_MODULE_ID +from ray.rllib.core import DEFAULT_AGENT_ID, DEFAULT_MODULE_ID from ray.rllib.core.columns import Columns from ray.rllib.env.env_runner import EnvRunner +from ray.rllib.env.single_agent_episode import SingleAgentEpisode from ray.rllib.env.wrappers.atari_wrappers import NoopResetEnv, MaxAndSkipEnv from ray.rllib.env.wrappers.dm_control_wrapper import DMCEnv from ray.rllib.env.utils import _gym_env_creator -from ray.rllib.evaluation.metrics import RolloutMetrics from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.env.single_agent_episode import SingleAgentEpisode -from ray.rllib.utils.numpy import one_hot +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.metrics import ( + EPISODE_DURATION_SEC_MEAN, + EPISODE_LEN_MAX, + EPISODE_LEN_MEAN, + EPISODE_LEN_MIN, + EPISODE_RETURN_MAX, + EPISODE_RETURN_MEAN, + EPISODE_RETURN_MIN, + NUM_AGENT_STEPS_SAMPLED, + NUM_AGENT_STEPS_SAMPLED_LIFETIME, + NUM_EPISODES, + NUM_ENV_STEPS_SAMPLED, + NUM_ENV_STEPS_SAMPLED_LIFETIME, + NUM_MODULE_STEPS_SAMPLED, + 
NUM_MODULE_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger +from ray.rllib.utils.numpy import convert_to_numpy, one_hot +from ray.rllib.utils.spaces.space_utils import batch, unbatch +from ray.rllib.utils.torch_utils import convert_to_torch_tensor +from ray.rllib.utils.typing import ResultDict from ray.tune.registry import ENV_CREATOR, _global_registry _, tf, _ = try_import_tf() +torch, _ = try_import_torch() # TODO (sven): Use SingleAgentEnvRunner instead of this as soon as we have the new @@ -70,23 +91,38 @@ def __init__( # However, in Danijar's repo, Atari100k experiments are configured as: # noop=30, 64x64x3 (no grayscaling), sticky actions=False, # full action space=False, - wrappers = [ - partial(gym.wrappers.TimeLimit, max_episode_steps=108000), - partial(resize_v1, x_size=64, y_size=64), # resize to 64x64 - NormalizedImageEnv, - NoopResetEnv, - MaxAndSkipEnv, - ] + + def _entry_point(): + return gym.make( + self.config.env, + **dict( + self.config.env_config, + **{ + # "sticky actions" but not according to Danijar's 100k + # configs. + "repeat_action_probability": 0.0, + # "full action space" but not according to Danijar's 100k + # configs. + "full_action_space": False, + # Already done by MaxAndSkip wrapper: "action repeat" == 4. + "frameskip": 1, + }, + ), + ) + + gym.register("rllib-single-agent-env-v0", entry_point=_entry_point) self.env = gym.vector.make( - "GymV26Environment-v0", - env_id=self.config.env, - wrappers=wrappers, + "rllib-single-agent-env-v0", num_envs=self.config.num_envs_per_env_runner, asynchronous=self.config.remote_worker_envs, - make_kwargs=dict( - self.config.env_config, **{"render_mode": "rgb_array"} - ), + wrappers=[ + partial(gym.wrappers.TimeLimit, max_episode_steps=108000), + partial(resize_v1, x_size=64, y_size=64), # resize to 64x64 + NormalizedImageEnv, + NoopResetEnv, + MaxAndSkipEnv, + ], ) # DeepMind Control. 
elif self.config.env.startswith("DMC/"): @@ -147,6 +183,24 @@ def __init__( # TODO (sven): DreamerV3 is currently single-agent only. self.module = self.marl_module_spec.build()[DEFAULT_MODULE_ID] + self.metrics = MetricsLogger() + + self._device = None + if ( + torch + and torch.cuda.is_available() + and self.config.framework_str == "torch" + and self.config.share_module_between_env_runner_and_learner + and self.config.num_gpus_per_learner > 0 + ): + gpu_ids = ray.get_gpu_ids() + self._device = f"cuda:{gpu_ids[0]}" + self.convert_to_tensor = ( + partial(convert_to_torch_tensor, device=self._device) + if self.config.framework_str == "torch" + else tf.convert_to_tensor + ) + self._needs_initial_reset = True self._episodes = [None for _ in range(self.num_envs)] self._states = [None for _ in range(self.num_envs)] @@ -158,7 +212,6 @@ def __init__( # via its replay buffer, etc..). self._done_episodes_for_metrics = [] self._ongoing_episodes_for_metrics = defaultdict(list) - self._ts_since_last_metrics = 0 @override(EnvRunner) def sample( @@ -228,7 +281,7 @@ def _sample_timesteps( explore: bool = True, random_actions: bool = False, force_reset: bool = False, - ) -> Tuple[List[SingleAgentEpisode], List[SingleAgentEpisode]]: + ) -> List[SingleAgentEpisode]: """Helper method to run n timesteps. See docstring of self.sample() for more details. @@ -238,47 +291,25 @@ def _sample_timesteps( # Get initial states for all `batch_size_B` rows in the forward batch. initial_states = tree.map_structure( lambda s: np.repeat(s, self.num_envs, axis=0), - self.module.get_initial_state(), + convert_to_numpy(self.module.get_initial_state()), ) # Have to reset the env (on all vector sub-envs). if force_reset or self._needs_initial_reset: obs, _ = self.env.reset() + self._needs_initial_reset = False self._episodes = [SingleAgentEpisode() for _ in range(self.num_envs)] - states = initial_states - # Set is_first to True for all rows (all sub-envs just got reset). 
- is_first = np.ones((self.num_envs,)) - self._needs_initial_reset = False # Set initial obs and states in the episodes. for i in range(self.num_envs): self._episodes[i].add_env_reset(observation=obs[i]) - self._states[i] = {k: s[i] for k, s in states.items()} + self._states[i] = None + # Don't reset existing envs; continue in already started episodes. else: # Pick up stored observations and states from previous timesteps. obs = np.stack([eps.observations[-1] for eps in self._episodes]) - # Compile the initial state for each batch row: If episode just started, use - # model's initial state, if not, use state stored last in - # SingleAgentEpisode. - states = { - k: np.stack( - [ - initial_states[k][i] - if self._states[i] is None - else self._states[i][k] - for i, eps in enumerate(self._episodes) - ] - ) - for k in initial_states.keys() - } - # If a batch row is at the beginning of an episode, set its `is_first` flag - # to 1.0, otherwise 0.0. - is_first = np.zeros((self.num_envs,)) - for i, eps in enumerate(self._episodes): - if len(eps) == 0: - is_first[i] = 1.0 # Loop through env for n timesteps. ts = 0 @@ -288,33 +319,35 @@ def _sample_timesteps( actions = self.env.action_space.sample() # Compute an action using our RLModule. else: - batch = { + is_first = np.zeros((self.num_envs,)) + for i, eps in enumerate(self._episodes): + if self._states[i] is None: + is_first[i] = 1.0 + self._states[i] = {k: s[i] for k, s in initial_states.items()} + to_module = { Columns.STATE_IN: tree.map_structure( - lambda s: tf.convert_to_tensor(s), states + lambda s: self.convert_to_tensor(s), batch(self._states) ), - Columns.OBS: tf.convert_to_tensor(obs), - "is_first": tf.convert_to_tensor(is_first), + Columns.OBS: self.convert_to_tensor(obs), + "is_first": self.convert_to_tensor(is_first), } # Explore or not. 
if explore: - outs = self.module.forward_exploration(batch) + outs = self.module.forward_exploration(to_module) else: - outs = self.module.forward_inference(batch) + outs = self.module.forward_inference(to_module) # Model outputs one-hot actions (if discrete). Convert to int actions # as well. - actions = outs[Columns.ACTIONS].numpy() + actions = convert_to_numpy(outs[Columns.ACTIONS]) if isinstance(self.env.single_action_space, gym.spaces.Discrete): actions = np.argmax(actions, axis=-1) - states = tree.map_structure( - lambda s: s.numpy(), outs[Columns.STATE_OUT] - ) + self._states = unbatch(convert_to_numpy(outs[Columns.STATE_OUT])) obs, rewards, terminateds, truncateds, infos = self.env.step(actions) ts += self.num_envs for i in range(self.num_envs): - s = {k: s[i] for k, s in states.items()} # The last entry in self.observations[i] is already the reset # obs of the new episode. if terminateds[i] or truncateds[i]: @@ -327,12 +360,7 @@ def _sample_timesteps( terminated=terminateds[i], truncated=truncateds[i], ) - self._states[i] = s - # Reset h-states to the model's initial ones b/c we are starting a - # new episode. - for k, v in self.module.get_initial_state().items(): - states[k][i] = v.numpy() - is_first[i] = True + self._states[i] = None done_episodes_to_return.append(self._episodes[i]) # Create a new episode object. self._episodes[i] = SingleAgentEpisode(observations=[obs[i]]) @@ -342,9 +370,6 @@ def _sample_timesteps( action=actions[i], reward=rewards[i], ) - is_first[i] = False - - self._states[i] = s # Return done episodes ... 
self._done_episodes_for_metrics.extend(done_episodes_to_return) @@ -356,9 +381,9 @@ def _sample_timesteps( for eps in ongoing_episodes: self._ongoing_episodes_for_metrics[eps.id_].append(eps) - self._ts_since_last_metrics += ts + self._increase_sampled_metrics(ts) - return done_episodes_to_return, ongoing_episodes + return done_episodes_to_return + ongoing_episodes def _sample_episodes( self, @@ -378,7 +403,7 @@ def _sample_episodes( # Multiply states n times according to our vector env batch size (num_envs). states = tree.map_structure( lambda s: np.repeat(s, self.num_envs, axis=0), - self.module.get_initial_state(), + convert_to_numpy(self.module.get_initial_state()), ) is_first = np.ones((self.num_envs,)) @@ -392,10 +417,10 @@ def _sample_episodes( else: batch = { Columns.STATE_IN: tree.map_structure( - lambda s: tf.convert_to_tensor(s), states + lambda s: self.convert_to_tensor(s), states ), - Columns.OBS: tf.convert_to_tensor(obs), - "is_first": tf.convert_to_tensor(is_first), + Columns.OBS: self.convert_to_tensor(obs), + "is_first": self.convert_to_tensor(is_first), } if explore: @@ -403,12 +428,10 @@ def _sample_episodes( else: outs = self.module.forward_inference(batch) - actions = outs[Columns.ACTIONS].numpy() + actions = convert_to_numpy(outs[Columns.ACTIONS]) if isinstance(self.env.single_action_space, gym.spaces.Discrete): actions = np.argmax(actions, axis=-1) - states = tree.map_structure( - lambda s: s.numpy(), outs[Columns.STATE_OUT] - ) + states = convert_to_numpy(outs[Columns.STATE_OUT]) obs, rewards, terminateds, truncateds, infos = self.env.step(actions) @@ -434,8 +457,10 @@ def _sample_episodes( # Reset h-states to the model's initial ones b/c we are starting a # new episode. 
- for k, v in self.module.get_initial_state().items(): - states[k][i] = v.numpy() + for k, v in convert_to_numpy( + self.module.get_initial_state() + ).items(): + states[k][i] = v is_first[i] = True episodes[i] = SingleAgentEpisode(observations=[obs[i]]) @@ -448,41 +473,50 @@ def _sample_episodes( is_first[i] = False self._done_episodes_for_metrics.extend(done_episodes_to_return) - self._ts_since_last_metrics += sum(len(eps) for eps in done_episodes_to_return) # If user calls sample(num_timesteps=..) after this, we must reset again # at the beginning. self._needs_initial_reset = True + ts = sum(map(len, done_episodes_to_return)) + self._increase_sampled_metrics(ts) + return done_episodes_to_return - # TODO (sven): Remove the requirement for EnvRunners/RolloutWorkers to have this - # API. Instead Algorithm should compile episode metrics itself via its local - # buffer. - def get_metrics(self) -> List[RolloutMetrics]: + def get_metrics(self) -> ResultDict: # Compute per-episode metrics (only on already completed episodes). - metrics = [] for eps in self._done_episodes_for_metrics: + assert eps.is_done + episode_length = len(eps) - episode_reward = eps.get_return() + episode_return = eps.get_return() + episode_duration_s = eps.get_duration_s() + # Don't forget about the already returned chunks of this episode. if eps.id_ in self._ongoing_episodes_for_metrics: for eps2 in self._ongoing_episodes_for_metrics[eps.id_]: episode_length += len(eps2) - episode_reward += eps2.get_return() + episode_return += eps2.get_return() del self._ongoing_episodes_for_metrics[eps.id_] - metrics.append( - RolloutMetrics( - episode_length=episode_length, - episode_reward=episode_reward, - ) + self._log_episode_metrics( + episode_length, episode_return, episode_duration_s ) + # Log num episodes counter for this iteration. + self.metrics.log_value( + NUM_EPISODES, + len(self._done_episodes_for_metrics), + reduce="sum", + # Reset internal data on `reduce()` call below (not a lifetime count). 
+ clear_on_reduce=True, + ) + + # Now that we have logged everything, clear cache of done episodes. self._done_episodes_for_metrics.clear() - self._ts_since_last_metrics = 0 - return metrics + # Return reduced metrics. + return self.metrics.reduce() # TODO (sven): Remove the requirement for EnvRunners/RolloutWorkers to have this # API. Replace by proper state overriding via `EnvRunner.set_state()` @@ -503,6 +537,52 @@ def stop(self): # Close our env object via gymnasium's API. self.env.close() + def _increase_sampled_metrics(self, num_steps): + # Per sample cycle stats. + self.metrics.log_value( + NUM_ENV_STEPS_SAMPLED, num_steps, reduce="sum", clear_on_reduce=True + ) + self.metrics.log_value( + (NUM_AGENT_STEPS_SAMPLED, DEFAULT_AGENT_ID), + num_steps, + reduce="sum", + clear_on_reduce=True, + ) + self.metrics.log_value( + (NUM_MODULE_STEPS_SAMPLED, DEFAULT_MODULE_ID), + num_steps, + reduce="sum", + clear_on_reduce=True, + ) + # Lifetime stats. + self.metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, num_steps, reduce="sum") + self.metrics.log_value( + (NUM_AGENT_STEPS_SAMPLED_LIFETIME, DEFAULT_AGENT_ID), + num_steps, + reduce="sum", + ) + self.metrics.log_value( + (NUM_MODULE_STEPS_SAMPLED_LIFETIME, DEFAULT_MODULE_ID), + num_steps, + reduce="sum", + ) + return num_steps + + def _log_episode_metrics(self, length, ret, sec): + # Log general episode metrics. + # To mimick the old API stack behavior, we'll use `window` here for + # these particular stats (instead of the default EMA). + win = self.config.metrics_num_episodes_for_smoothing + self.metrics.log_value(EPISODE_LEN_MEAN, length, window=win) + self.metrics.log_value(EPISODE_RETURN_MEAN, ret, window=win) + self.metrics.log_value(EPISODE_DURATION_SEC_MEAN, sec, window=win) + + # For some metrics, log min/max as well. 
+ self.metrics.log_value(EPISODE_LEN_MIN, length, reduce="min") + self.metrics.log_value(EPISODE_RETURN_MIN, ret, reduce="min") + self.metrics.log_value(EPISODE_LEN_MAX, length, reduce="max") + self.metrics.log_value(EPISODE_RETURN_MAX, ret, reduce="max") + class NormalizedImageEnv(gym.ObservationWrapper): def __init__(self, *args, **kwargs): diff --git a/rllib/algorithms/dreamerv3/utils/summaries.py b/rllib/algorithms/dreamerv3/utils/summaries.py index f78876c83fe7..dd36adbb3160 100644 --- a/rllib/algorithms/dreamerv3/utils/summaries.py +++ b/rllib/algorithms/dreamerv3/utils/summaries.py @@ -13,16 +13,16 @@ create_cartpole_dream_image, create_frozenlake_dream_image, ) +from ray.rllib.core import DEFAULT_MODULE_ID from ray.rllib.core.columns import Columns +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.metrics import ( + LEARNER_RESULTS, + REPLAY_BUFFER_RESULTS, +) from ray.rllib.utils.tf_utils import inverse_symlog - -def _summarize(*, results, data_to_summarize, keys_to_log, include_histograms=False): - for k in keys_to_log: - if data_to_summarize[k].shape == (): - results.update({k: data_to_summarize[k]}) - elif include_histograms: - results.update({k: data_to_summarize[k]}) +torch, _ = try_import_torch() def reconstruct_obs_from_h_and_z( @@ -39,10 +39,18 @@ def reconstruct_obs_from_h_and_z( # Note that the last h-state (T+1) is NOT used here as it's already part of # a new trajectory. # Use mean() of the Gaussian, no sample! -> No need to construct dist object here. - reconstructed_obs_distr_means_TxB = dreamer_model.world_model.decoder( - # Fold time rank. - h=np.reshape(h_t0_to_H, (T * B, -1)), - z=np.reshape(z_t0_to_H, (T * B,) + z_t0_to_H.shape[2:]), + device = next(iter(dreamer_model.world_model.decoder.parameters())).device + reconstructed_obs_distr_means_TxB = ( + dreamer_model.world_model.decoder( + # Fold time rank. 
+ h=torch.from_numpy(h_t0_to_H).reshape((T * B, -1)).to(device), + z=torch.from_numpy(z_t0_to_H) + .reshape((T * B,) + z_t0_to_H.shape[2:]) + .to(device), + ) + .detach() + .cpu() + .numpy() ) # Unfold time rank again. reconstructed_obs_T_B = np.reshape( @@ -115,11 +123,12 @@ def report_dreamed_trajectory( def report_predicted_vs_sampled_obs( *, - results, + metrics, sample, batch_size_B, batch_length_T, symlog_obs: bool = True, + do_report: bool = True, ): """Summarizes sampled data (from the replay buffer) vs world-model predictions. @@ -133,127 +142,188 @@ def report_predicted_vs_sampled_obs( Continues: Compute MSE (sampled vs predicted). Args: - results: The results dict that was returned by - `LearnerGroup.update_from_batch()`. + metrics: The MetricsLogger object of the DreamerV3 algo. sample: The sampled data (dict) from the replay buffer. Already tf-tensor converted. batch_size_B: The batch size (B). This is the number of trajectories sampled from the buffer. batch_length_T: The batch length (T). This is the length of an individual trajectory sampled from the buffer. + do_report: Whether to actually log the report (default). If this is set to + False, this function serves as a clean-up on the given metrics, making sure + they do NOT contain anymore any (spacious) data relevant for producing + the report/videos. 
""" - predicted_observation_means_BxT = results[ - "WORLD_MODEL_fwd_out_obs_distribution_means_BxT" - ] + fwd_output_key = ( + LEARNER_RESULTS, + DEFAULT_MODULE_ID, + "WORLD_MODEL_fwd_out_obs_distribution_means_b0xT", + ) + # logged as a non-reduced item (still a list) + predicted_observation_means_single_example = metrics.peek( + fwd_output_key, default=[None] + )[-1] + metrics.delete(fwd_output_key, key_error=False) + + final_result_key = ( + f"WORLD_MODEL_sampled_vs_predicted_posterior_b0x{batch_length_T}_videos" + ) + if not do_report: + metrics.delete(final_result_key, key_error=False) + return + _report_obs( - results=results, + metrics=metrics, computed_float_obs_B_T_dims=np.reshape( - predicted_observation_means_BxT, - (batch_size_B, batch_length_T) + sample[Columns.OBS].shape[2:], + predicted_observation_means_single_example, + # WandB videos need to be channels first. + (1, batch_length_T) + sample[Columns.OBS].shape[2:], ), - sampled_obs_B_T_dims=sample[Columns.OBS], - descr_prefix="WORLD_MODEL", - descr_obs=f"predicted_posterior_T{batch_length_T}", + sampled_obs_B_T_dims=sample[Columns.OBS][0:1], + metrics_key=final_result_key, symlog_obs=symlog_obs, ) def report_dreamed_eval_trajectory_vs_samples( *, - results, - dream_data, + metrics, sample, burn_in_T, dreamed_T, dreamer_model, symlog_obs: bool = True, -): + do_report: bool = True, +) -> None: + """Logs dreamed observations, rewards, continues and compares them vs sampled data. + + For obs, we'll try to create videos (side-by-side comparison) of the dreamed, + recreated-from-prior obs vs the sampled ones (over dreamed_T timesteps). + + Args: + metrics: The MetricsLogger object of the DreamerV3 algo. + sample: The sampled data (dict) from the replay buffer. Already tf-tensor + converted. + burn_in_T: The number of burn-in timesteps (these will be skipped over in the + reported video comparisons and MSEs). + dreamed_T: The number of timesteps to produce dreamed data for. 
+ dreamer_model: The DreamerModel to use to create observation vectors/images + from dreamed h- and (prior) z-states. + symlog_obs: Whether to inverse-symlog the computed observations or not. Set this + to True for environments, in which we should symlog the observations. + do_report: Whether to actually log the report (default). If this is set to + False, this function serves as a clean-up on the given metrics, making sure + they do NOT contain anymore any (spacious) data relevant for producing + the report/videos. + """ + dream_data = metrics.peek( + LEARNER_RESULTS, + DEFAULT_MODULE_ID, + "dream_data", + default={}, + ) + metrics.delete(LEARNER_RESULTS, DEFAULT_MODULE_ID, "dream_data", key_error=False) + + final_result_key_obs = f"EVALUATION_sampled_vs_dreamed_prior_H{dreamed_T}_obs" + final_result_key_rew = ( + f"EVALUATION_sampled_vs_dreamed_prior_H{dreamed_T}_rewards_MSE" + ) + final_result_key_cont = ( + f"EVALUATION_sampled_vs_dreamed_prior_H{dreamed_T}_continues_MSE" + ) + if not do_report: + metrics.delete(final_result_key_obs, key_error=False) + metrics.delete(final_result_key_rew, key_error=False) + metrics.delete(final_result_key_cont, key_error=False) + return + # Obs MSE. - dreamed_obs_T_B = reconstruct_obs_from_h_and_z( - h_t0_to_H=dream_data["h_states_t0_to_H_BxT"], - z_t0_to_H=dream_data["z_states_prior_t0_to_H_BxT"], + dreamed_obs_H_B = reconstruct_obs_from_h_and_z( + h_t0_to_H=dream_data["h_states_t0_to_H_Bx1"][0], + z_t0_to_H=dream_data["z_states_prior_t0_to_H_Bx1"][0], dreamer_model=dreamer_model, obs_dims_shape=sample[Columns.OBS].shape[2:], ) - t0 = burn_in_T - 1 + t0 = burn_in_T tH = t0 + dreamed_T # Observation MSE and - if applicable - images comparisons. - mse_sampled_vs_dreamed_obs = _report_obs( - results=results, - # Have to transpose b/c dreamed data is time-major. 
- computed_float_obs_B_T_dims=np.transpose( - dreamed_obs_T_B, - axes=[1, 0] + list(range(2, len(dreamed_obs_T_B.shape))), - ), - sampled_obs_B_T_dims=sample[Columns.OBS][:, t0 : tH + 1], - descr_prefix="EVALUATION", - descr_obs=f"dreamed_prior_H{dreamed_T}", + _report_obs( + metrics=metrics, + # WandB videos need to be 5D (B, L, c, h, w) -> transpose/swap H and B axes. + computed_float_obs_B_T_dims=np.swapaxes(dreamed_obs_H_B, 0, 1)[ + 0:1 + ], # for now: only B=1 + sampled_obs_B_T_dims=sample[Columns.OBS][0:1, t0:tH], + metrics_key=final_result_key_obs, symlog_obs=symlog_obs, ) # Reward MSE. _report_rewards( - results=results, - computed_rewards=dream_data["rewards_dreamed_t0_to_H_BxT"], - sampled_rewards=sample[Columns.REWARDS][:, t0 : tH + 1], - descr_prefix="EVALUATION", - descr_reward=f"dreamed_prior_H{dreamed_T}", + metrics=metrics, + computed_rewards=dream_data["rewards_dreamed_t0_to_H_Bx1"][0], + sampled_rewards=sample[Columns.REWARDS][:, t0:tH], + metrics_key=final_result_key_rew, ) # Continues MSE. _report_continues( - results=results, - computed_continues=dream_data["continues_dreamed_t0_to_H_BxT"], - sampled_continues=(1.0 - sample["is_terminated"])[:, t0 : tH + 1], - descr_prefix="EVALUATION", - descr_cont=f"dreamed_prior_H{dreamed_T}", + metrics=metrics, + computed_continues=dream_data["continues_dreamed_t0_to_H_Bx1"][0], + sampled_continues=(1.0 - sample["is_terminated"])[:, t0:tH], + metrics_key=final_result_key_cont, ) - return mse_sampled_vs_dreamed_obs -def report_sampling_and_replay_buffer(*, replay_buffer): +def report_sampling_and_replay_buffer(*, metrics, replay_buffer): episodes_in_buffer = replay_buffer.get_num_episodes() ts_in_buffer = replay_buffer.get_num_timesteps() replayed_steps = replay_buffer.get_sampled_timesteps() added_steps = replay_buffer.get_added_timesteps() # Summarize buffer, sampling, and train ratio stats. 
- return { - "BUFFER_capacity": replay_buffer.capacity, - "BUFFER_size_num_episodes": episodes_in_buffer, - "BUFFER_size_timesteps": ts_in_buffer, - "BUFFER_replayed_steps": replayed_steps, - "BUFFER_added_steps": added_steps, - } + metrics.log_dict( + { + "capacity": replay_buffer.capacity, + "size_num_episodes": episodes_in_buffer, + "size_timesteps": ts_in_buffer, + "replayed_steps": replayed_steps, + "added_steps": added_steps, + }, + key=REPLAY_BUFFER_RESULTS, + window=1, + ) # window=1 b/c these are current (total count/state) values. def _report_obs( *, - results, + metrics, computed_float_obs_B_T_dims, sampled_obs_B_T_dims, - descr_prefix=None, - descr_obs, + metrics_key, symlog_obs, ): """Summarizes computed- vs sampled observations: MSE and (if applicable) images. Args: + metrics: The MetricsLogger object of the DreamerV3 algo. computed_float_obs_B_T_dims: Computed float observations (not clipped, not cast'd). Shape=(B, T, [dims ...]). sampled_obs_B_T_dims: Sampled observations (as-is from the environment, meaning this could be uint8, 0-255 clipped images). Shape=(B, T, [dims ...]). - B: The batch size B (see shapes of `computed_float_obs_B_T_dims` and - `sampled_obs_B_T_dims` above). - T: The batch length T (see shapes of `computed_float_obs_B_T_dims` and - `sampled_obs_B_T_dims` above). - descr: A string used to describe the computed data to be used in the TB - summaries. + metrics_key: The metrics key (or key sequence) under which to log ths resulting + video sequence. + symlog_obs: Whether to inverse-symlog the computed observations or not. Set this + to True for environments, in which we should symlog the observations. + """ # Videos: Create summary, comparing computed images with actual sampled ones. # 4=[B, T, w, h] grayscale image; 5=[B, T, w, h, C] RGB image. if len(sampled_obs_B_T_dims.shape) in [4, 5]: - descr_prefix = (descr_prefix + "_") if descr_prefix else "" + # WandB videos need to be channels first. 
+ transpose_axes = ( + (0, 1, 4, 2, 3) if len(sampled_obs_B_T_dims.shape) == 5 else (0, 3, 1, 2) + ) if symlog_obs: computed_float_obs_B_T_dims = inverse_symlog(computed_float_obs_B_T_dims) @@ -265,68 +335,63 @@ def _report_obs( sampled_obs_B_T_dims = np.clip(sampled_obs_B_T_dims, 0.0, 255.0).astype( np.uint8 ) + sampled_obs_B_T_dims = np.transpose(sampled_obs_B_T_dims, transpose_axes) computed_images = np.clip(computed_float_obs_B_T_dims, 0.0, 255.0).astype( np.uint8 ) + computed_images = np.transpose(computed_images, transpose_axes) # Concat sampled and computed images along the height axis (3) such that # real images show below respective predicted ones. # (B, T, C, h, w) sampled_vs_computed_images = np.concatenate( [computed_images, sampled_obs_B_T_dims], - axis=3, + axis=-1, # concat on width axis (looks nicer) ) # Add grayscale dim, if necessary. if len(sampled_obs_B_T_dims.shape) == 2 + 2: sampled_vs_computed_images = np.expand_dims(sampled_vs_computed_images, -1) - results.update( - {f"{descr_prefix}sampled_vs_{descr_obs}_videos": sampled_vs_computed_images} + metrics.log_value( + metrics_key, + sampled_vs_computed_images, + reduce=None, # No reduction, we want the obs tensor to stay in-tact. 
+ window=1, ) - # return mse_sampled_vs_computed_obs - def _report_rewards( *, - results, + metrics, computed_rewards, sampled_rewards, - descr_prefix=None, - descr_reward, + metrics_key, ): - descr_prefix = (descr_prefix + "_") if descr_prefix else "" mse_sampled_vs_computed_rewards = np.mean( np.square(computed_rewards - sampled_rewards) ) mse_sampled_vs_computed_rewards = np.mean(mse_sampled_vs_computed_rewards) - results.update( - { - f"{descr_prefix}sampled_vs_{descr_reward}_rewards_mse": ( - mse_sampled_vs_computed_rewards - ), - } + metrics.log_value( + metrics_key, + mse_sampled_vs_computed_rewards, + window=1, ) def _report_continues( *, - results, + metrics, computed_continues, sampled_continues, - descr_prefix=None, - descr_cont, + metrics_key, ): - descr_prefix = (descr_prefix + "_") if descr_prefix else "" # Continue MSE. mse_sampled_vs_computed_continues = np.mean( np.square( computed_continues - sampled_continues.astype(computed_continues.dtype) ) ) - results.update( - { - f"{descr_prefix}sampled_vs_{descr_cont}_continues_mse": ( - mse_sampled_vs_computed_continues - ), - } + metrics.log_value( + metrics_key, + mse_sampled_vs_computed_continues, + window=1, ) diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index e1d414c3179b..b506dcf546aa 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -37,6 +37,7 @@ NUM_AGENT_STEPS_SAMPLED, NUM_AGENT_STEPS_TRAINED, NUM_ENV_STEPS_SAMPLED, + NUM_ENV_STEPS_SAMPLED_LIFETIME, NUM_ENV_STEPS_TRAINED, NUM_MODULE_STEPS_TRAINED, NUM_SYNCH_WORKER_WEIGHTS, @@ -720,6 +721,7 @@ def training_step(self) -> ResultDict: if self.config.enable_rl_module_and_learner: train_results = self.learn_on_processed_samples() module_ids_to_update = set(train_results.keys()) - {ALL_MODULES} + # TODO (sven): Move to Learner._after_gradient_based_update(). 
additional_results = self.learner_group.additional_update( module_ids_to_update=module_ids_to_update, timestep=self._counters[ @@ -959,6 +961,11 @@ def learn_on_processed_samples(self) -> ResultDict: for batch in batches: result = self.learner_group.update_from_batch( batch=batch, + timesteps={ + NUM_ENV_STEPS_SAMPLED_LIFETIME: ( + self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) + ), + }, async_update=async_update, num_iters=self.config.num_sgd_iter, minibatch_size=self.config.minibatch_size, diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index ce491f9ca09d..60dfe4b6eed6 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -479,6 +479,11 @@ def _training_step_new_api_stack(self) -> ResultDict: with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)): learner_results = self.learner_group.update_from_episodes( episodes=episodes, + timesteps={ + NUM_ENV_STEPS_SAMPLED_LIFETIME: ( + self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) + ), + }, minibatch_size=( self.config.mini_batch_size_per_learner or self.config.sgd_minibatch_size @@ -515,7 +520,6 @@ def _training_step_new_api_stack(self) -> ResultDict: # Sync weights from learner_group to all rollout workers. from_worker_or_learner_group=self.learner_group, policies=modules_to_update, - global_vars=None, inference_only=True, ) else: @@ -542,7 +546,8 @@ def _training_step_new_api_stack(self) -> ResultDict: ) kl_dict[mid] = kl - # triggers a special update method on RLOptimizer to update the KL values. + # TODO (sven): Move to Learner._after_gradient_based_update(). + # Triggers a special update method on RLOptimizer to update the KL values. 
additional_results = self.learner_group.additional_update( module_ids_to_update=modules_to_update, sampled_kl_values=kl_dict, diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py index f3defd5f7520..f7c89f167f8b 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py @@ -7,9 +7,7 @@ ) from ray.rllib.algorithms.callbacks import DefaultCallbacks from ray.rllib.core import DEFAULT_MODULE_ID -from ray.rllib.core.learner.learner import ( - LEARNER_RESULTS_CURR_LR_KEY, -) +from ray.rllib.core.learner.learner import DEFAULT_OPTIMIZER, LR_KEY from ray.rllib.utils.metrics import LEARNER_RESULTS from ray.rllib.utils.test_utils import ( @@ -44,7 +42,7 @@ def on_train_result(self, *, algorithm, result: dict, **kwargs): # Learning rate should decrease by 0.0001/4 per iteration. check( - stats[LEARNER_RESULTS_CURR_LR_KEY], + stats[DEFAULT_OPTIMIZER + "_" + LR_KEY], 0.0000075 if algorithm.iteration == 1 else 0.000005, ) # Compare reported curr lr vs the actual lr found in the optimizer object. 
@@ -54,7 +52,7 @@ def on_train_result(self, *, algorithm, result: dict, **kwargs): if algorithm.config.framework_str == "torch" else optim.lr ) - check(stats[LEARNER_RESULTS_CURR_LR_KEY], actual_optimizer_lr) + check(stats[DEFAULT_OPTIMIZER + "_" + LR_KEY], actual_optimizer_lr) class TestPPO(unittest.TestCase): diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index 7c89be47b189..4f9b32041829 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -10,9 +10,7 @@ from ray.rllib.algorithms.callbacks import DefaultCallbacks from ray.rllib.algorithms.ppo.tests.test_ppo import PENDULUM_FAKE_BATCH from ray.rllib.core import DEFAULT_MODULE_ID -from ray.rllib.core.learner.learner import ( - LEARNER_RESULTS_CURR_LR_KEY, -) +from ray.rllib.core.learner.learner import DEFAULT_OPTIMIZER, LR_KEY from ray.rllib.evaluation.postprocessing import ( compute_gae_for_sample_batch, ) @@ -50,7 +48,7 @@ def on_train_result(self, *, algorithm, result: dict, **kwargs): # Learning rate should decrease by 0.0001/4 per iteration. check( - stats[LEARNER_RESULTS_CURR_LR_KEY], + stats[DEFAULT_OPTIMIZER + "_" + LR_KEY], 0.0000075 if algorithm.iteration == 1 else 0.000005, ) # Compare reported curr lr vs the actual lr found in the optimizer object. 
@@ -60,7 +58,7 @@ def on_train_result(self, *, algorithm, result: dict, **kwargs): if algorithm.config.framework_str == "torch" else optim.lr ) - check(stats[LEARNER_RESULTS_CURR_LR_KEY], actual_optimizer_lr) + check(stats[DEFAULT_OPTIMIZER + "_" + LR_KEY], actual_optimizer_lr) class TestPPO(unittest.TestCase): diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 887be4f2bf92..b71877fdf892 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -43,6 +43,7 @@ from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.metrics import ( ALL_MODULES, + NUM_ENV_STEPS_SAMPLED_LIFETIME, NUM_ENV_STEPS_TRAINED, NUM_MODULE_STEPS_TRAINED, ) @@ -85,7 +86,7 @@ ENTROPY_KEY = "entropy" # Additional update keys -LEARNER_RESULTS_CURR_LR_KEY = "curr_lr" +LR_KEY = "learning_rate" @dataclass @@ -287,6 +288,10 @@ def __init__( # and return the resulting (reduced) dict. self.metrics = MetricsLogger() + # TODO (sven): Do we really need this API? It seems like LearnerGroup constructs + # all Learner workers and then immediately builds them any ways? Seems to make + # thing more complicated. Unless there is a reason related to Train worker group + # setup. @OverrideToImplementCustomLogic_CallToSuperRecommended def build(self) -> None: """Builds the Learner. @@ -528,7 +533,6 @@ def postprocess_gradients_for_module( module_id: ModuleID, config: Optional["AlgorithmConfig"] = None, module_gradients_dict: ParamDict, - hps=None, ) -> ParamDict: """Applies postprocessing operations on the gradients of the given module. @@ -547,13 +551,6 @@ def postprocess_gradients_for_module( A dictionary with the updated gradients and the exact same (flat) structure as the incoming `module_gradients_dict` arg. """ - if hps is not None: - deprecation_warning( - old="Learner.postprocess_gradients_for_module(.., hps=..)", - help="Deprecated argument. 
Use `config` (AlgorithmConfig) instead.", - error=True, - ) - postprocessed_grads = {} if config.grad_clip is None: @@ -1029,7 +1026,6 @@ def additional_update_for_module( module_id: ModuleID, config: Optional["AlgorithmConfig"] = None, timestep: int, - hps=None, **kwargs, ) -> None: """Apply additional non-gradient based updates for a single module. @@ -1045,36 +1041,14 @@ def additional_update_for_module( Returns: A dictionary of results from the update """ - if hps is not None: - deprecation_warning( - old="Learner.additional_update_for_module(.., hps=..)", - help="Deprecated argument. Use `config` (AlgorithmConfig) instead.", - error=True, - ) - - # Only cover the optimizer mapped to this particular module. - for optimizer_name, optimizer in self.get_optimizers_for_module(module_id): - # Only update this optimizer's lr, if a scheduler has been registered - # along with it. - if optimizer in self._optimizer_lr_schedules: - new_lr = self._optimizer_lr_schedules[optimizer].update( - timestep=timestep - ) - self._set_optimizer_lr(optimizer, lr=new_lr) - - # Make sure our returned results differentiate by optimizer name - # (if not the default name). - stats_name = LEARNER_RESULTS_CURR_LR_KEY - if optimizer_name != DEFAULT_OPTIMIZER: - stats_name += "_" + optimizer_name - self.metrics.log_value( - key=(module_id, stats_name), value=new_lr, window=1 - ) + pass def update_from_batch( self, batch: MultiAgentBatch, *, + # TODO (sven): Make this a more formal structure with its own type. + timesteps: Optional[Dict[str, Any]] = None, # TODO (sven): Deprecate these in favor of config attributes for only those # algos that actually need (and know how) to do minibatching. minibatch_size: Optional[int] = None, @@ -1090,6 +1064,9 @@ def update_from_batch( Args: batch: A batch of training data to update from. + timesteps: Timesteps dict, which must have the key + `NUM_ENV_STEPS_SAMPLED_LIFETIME`. + # TODO (sven): Make this a more formal structure with its own type. 
minibatch_size: The size of the minibatch to use for each update. num_iters: The number of complete passes over all the sub-batches in the input multi-agent batch. @@ -1113,7 +1090,7 @@ def update_from_batch( ) return self._update_from_batch_or_episodes( batch=batch, - episodes=None, + timesteps=timesteps, minibatch_size=minibatch_size, num_iters=num_iters, ) @@ -1122,6 +1099,8 @@ def update_from_episodes( self, episodes: List[EpisodeType], *, + # TODO (sven): Make this a more formal structure with its own type. + timesteps: Optional[Dict[str, Any]] = None, # TODO (sven): Deprecate these in favor of config attributes for only those # algos that actually need (and know how) to do minibatching. minibatch_size: Optional[int] = None, @@ -1138,6 +1117,9 @@ def update_from_episodes( Args: episodes: An list of episode objects to update from. + timesteps: Timesteps dict, which must have the key + `NUM_ENV_STEPS_SAMPLED_LIFETIME`. + # TODO (sven): Make this a more formal structure with its own type. minibatch_size: The size of the minibatch to use for each update. num_iters: The number of complete passes over all the sub-batches in the input multi-agent batch. @@ -1167,8 +1149,8 @@ def update_from_episodes( error=True, ) return self._update_from_batch_or_episodes( - batch=None, episodes=episodes, + timesteps=timesteps, minibatch_size=minibatch_size, num_iters=num_iters, min_total_mini_batches=min_total_mini_batches, @@ -1270,17 +1252,16 @@ def _update_from_batch_or_episodes( # as well for simplicity. batch: Optional[MultiAgentBatch] = None, episodes: Optional[List[EpisodeType]] = None, + # TODO (sven): Make this a more formal structure with its own type. + timesteps: Optional[Dict[str, Any]] = None, # TODO (sven): Deprecate these in favor of config attributes for only those # algos that actually need (and know how) to do minibatching. 
minibatch_size: Optional[int] = None, num_iters: int = 1, min_total_mini_batches: int = 0, ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: - self._check_is_built() - if num_iters < 1: - # We must do at least one pass on the batch for training. - raise ValueError("`num_iters` must be >= 1") + self._check_is_built() # Call the learner connector. if self._learner_connector is not None and episodes is not None: @@ -1310,6 +1291,7 @@ def _update_from_batch_or_episodes( f"Found IDs: {unknown_module_ids}" ) + # TODO: Move this into LearnerConnector pipeline? # Filter out those RLModules from the final train batch that should not be # updated. for module_id in list(batch.policy_batches.keys()): @@ -1326,14 +1308,10 @@ def _update_from_batch_or_episodes( { (ALL_MODULES, NUM_ENV_STEPS_TRAINED): batch.env_steps(), (ALL_MODULES, NUM_MODULE_STEPS_TRAINED): batch.agent_steps(), - }, - reduce="sum", - clear_on_reduce=True, - ) - self.metrics.log_dict( - { - (mid, NUM_MODULE_STEPS_TRAINED): len(b) - for mid, b in batch.policy_batches.items() + **{ + (mid, NUM_MODULE_STEPS_TRAINED): len(b) + for mid, b in batch.policy_batches.items() + }, }, reduce="sum", clear_on_reduce=True, @@ -1388,11 +1366,48 @@ def _update_from_batch_or_episodes( self._set_slicing_by_batch_id(batch, value=False) + # Call `_after_gradient_based_update` to allow for non-gradient based + # cleanups-, logging-, and update logic to happen. + self._after_gradient_based_update(timesteps) + + # Reduce results across all minibatch update steps. + return self.metrics.reduce() + + @OverrideToImplementCustomLogic_CallToSuperRecommended + def _after_gradient_based_update(self, timesteps: Dict[str, Any]) -> None: + """Called after gradient-based updates are completed. + + Should be overridden to implement custom cleanup-, logging-, or non-gradient- + based Learner/RLModule update logic after(!) gradient-based updates have been + completed. 
+ + Args: + timesteps: Timesteps dict, which must have the key + `NUM_ENV_STEPS_SAMPLED_LIFETIME`. + # TODO (sven): Make this a more formal structure with its own type. + """ + timesteps = timesteps or {} + + # Only update this optimizer's lr, if a scheduler has been registered + # along with it. + for module_id, optimizer_names in self._module_optimizers.items(): + for optimizer_name in optimizer_names: + optimizer = self._named_optimizers[optimizer_name] + lr_schedule = self._optimizer_lr_schedules.get(optimizer) + if lr_schedule is None: + continue + new_lr = lr_schedule.update( + timestep=timesteps.get(NUM_ENV_STEPS_SAMPLED_LIFETIME, 0) + ) + self._set_optimizer_lr(optimizer, lr=new_lr) + # Log all current learning rates of all our optimizers (registered under the # different ModuleIDs). self.metrics.log_dict( { - (mid, f"{full_name[len(mid) + 1 :]}_lr"): convert_to_numpy( + # Cut out the module ID from the beginning since it's already part of + # the key sequence: (ModuleID, "[optim name]_lr"). + (mid, f"{full_name[len(mid) + 1:]}_{LR_KEY}"): convert_to_numpy( self._get_optimizer_lr(self._named_optimizers[full_name]) ) for mid, full_names in self._module_optimizers.items() @@ -1401,9 +1416,6 @@ def _update_from_batch_or_episodes( window=1, ) - # Reduce results across all minibatch update steps. - return self.metrics.reduce() - def _set_slicing_by_batch_id( self, batch: MultiAgentBatch, *, value: bool ) -> MultiAgentBatch: @@ -1456,7 +1468,6 @@ def _save_optimizers(self, path: Union[str, pathlib.Path]) -> None: Args: path: The path to the directory to save the state to. - """ pass @@ -1465,7 +1476,6 @@ def _load_optimizers(self, path: Union[str, pathlib.Path]) -> None: Args: path: The path to the directory to load the state from. - """ pass @@ -1491,7 +1501,6 @@ def save_state(self, path: Union[str, pathlib.Path]) -> None: Args: path: The path to the directory to save the state to. 
- """ self._check_is_built() path = pathlib.Path(path) diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index acc42ef83dea..5496cb0ce5ec 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -220,6 +220,7 @@ def update_from_batch( self, batch: MultiAgentBatch, *, + timesteps: Optional[Dict[str, Any]] = None, async_update: bool = False, # TODO (sven): Deprecate the following args. They should be extracted from # self.config of those specific algorithms that actually require these @@ -266,7 +267,7 @@ def update_from_batch( ) return self._update( batch=batch, - episodes=None, + timesteps=timesteps, async_update=async_update, minibatch_size=minibatch_size, num_iters=num_iters, @@ -276,6 +277,7 @@ def update_from_episodes( self, episodes: List[EpisodeType], *, + timesteps: Optional[Dict[str, Any]] = None, async_update: bool = False, # TODO (sven): Deprecate the following args. They should be extracted from # self.config of those specific algorithms that actually require these @@ -322,8 +324,8 @@ def update_from_episodes( ) return self._update( - batch=None, episodes=episodes, + timesteps=timesteps, async_update=async_update, minibatch_size=minibatch_size, num_iters=num_iters, @@ -334,6 +336,7 @@ def _update( *, batch: Optional[MultiAgentBatch] = None, episodes: Optional[List[EpisodeType]] = None, + timesteps: Optional[Dict[str, Any]] = None, async_update: bool = False, minibatch_size: Optional[int] = None, num_iters: int = 1, @@ -342,22 +345,25 @@ def _update( # Define function to be called on all Learner actors (or the local learner). 
def _learner_update( learner: Learner, - batch_shard=None, - episodes_shard=None, - min_total_mini_batches=0, + _batch_shard=None, + _episodes_shard=None, + _timesteps=None, + _min_total_mini_batches=0, ): - if batch_shard is not None: + if _batch_shard is not None: return learner.update_from_batch( - batch=batch_shard, + batch=_batch_shard, + timesteps=_timesteps, minibatch_size=minibatch_size, num_iters=num_iters, ) else: return learner.update_from_episodes( - episodes=episodes_shard, + episodes=_episodes_shard, + timesteps=_timesteps, minibatch_size=minibatch_size, num_iters=num_iters, - min_total_mini_batches=min_total_mini_batches, + min_total_mini_batches=_min_total_mini_batches, ) # Local Learner worker: Don't shard batch/episodes, just run data as-is through @@ -372,8 +378,9 @@ def _learner_update( results = [ _learner_update( learner=self._learner, - batch_shard=batch, - episodes_shard=episodes, + _batch_shard=batch, + _episodes_shard=episodes, + _timesteps=timesteps, ) ] # One or more remote Learners: Shard batch/episodes into equal pieces (roughly @@ -387,7 +394,9 @@ def _learner_update( # "lockstep"), the `ShardBatchIterator` should not be used. 
if episodes is None: partials = [ - partial(_learner_update, batch_shard=batch_shard) + partial( + _learner_update, _batch_shard=batch_shard, _timesteps=timesteps + ) for batch_shard in ShardBatchIterator(batch, len(self._workers)) ] # Single- or MultiAgentEpisodes: Shard into equal pieces (only roughly equal @@ -424,8 +433,9 @@ def _learner_update( partials = [ partial( _learner_update, - episodes_shard=eps_shard, - min_total_mini_batches=min_total_mini_batches, + _episodes_shard=eps_shard, + _timesteps=timesteps, + _min_total_mini_batches=min_total_mini_batches, ) for eps_shard in eps_shards ] diff --git a/rllib/core/learner/torch/torch_learner.py b/rllib/core/learner/torch/torch_learner.py index cacce6f90b06..6dc586c12077 100644 --- a/rllib/core/learner/torch/torch_learner.py +++ b/rllib/core/learner/torch/torch_learner.py @@ -136,7 +136,7 @@ def compute_gradients( self, loss_per_module: Dict[ModuleID, TensorType], **kwargs ) -> ParamDict: for optim in self._optimizer_parameters: - # set_to_none is a faster way to zero out the gradients + # `set_to_none=True` is a faster way to zero out the gradients. optim.zero_grad(set_to_none=True) loss_per_module[ALL_MODULES].backward() grads = {pid: p.grad for pid, p in self._params.items()} diff --git a/rllib/core/models/torch/heads.py b/rllib/core/models/torch/heads.py index 95f61e58c4fd..d634ff2ef4c8 100644 --- a/rllib/core/models/torch/heads.py +++ b/rllib/core/models/torch/heads.py @@ -218,7 +218,7 @@ def __init__(self, config: CNNTransposeHeadConfig) -> None: if initial_dense_weights_initializer: initial_dense_weights_initializer( self.initial_dense.weight, - **config.initial_dense_initializer_config or {}, + **config.initial_dense_weights_initializer_config or {}, ) # Initialized dense layer bais, if necessary. 
if initial_dense_bias_initializer: diff --git a/rllib/core/models/torch/primitives.py b/rllib/core/models/torch/primitives.py index a8b8d9b27fd2..0be093730279 100644 --- a/rllib/core/models/torch/primitives.py +++ b/rllib/core/models/torch/primitives.py @@ -161,7 +161,8 @@ def __init__( # Insert a layer normalization in between layer's output and # the activation. if hidden_layer_use_layernorm: - layers.append(nn.LayerNorm(dims[i + 1])) + # We use an epsilon of 0.001 here to mimick the Tf default behavior. + layers.append(nn.LayerNorm(dims[i + 1], eps=0.001)) # Add the activation function. if hidden_activation is not None: layers.append(hidden_activation()) @@ -294,7 +295,8 @@ def __init__( # Layernorm. if cnn_use_layernorm: - layers.append(nn.LayerNorm((out_depth, out_size[0], out_size[1]))) + # We use an epsilon of 0.001 here to mimick the Tf default behavior. + layers.append(LayerNorm1D(out_depth, eps=0.001)) # Activation. if cnn_activation is not None: layers.append(cnn_activation()) @@ -446,7 +448,7 @@ def __init__( layers.append(layer) # Layernorm (never for final layer). if cnn_transpose_use_layernorm and not is_final_layer: - layers.append(nn.LayerNorm((out_depth, out_size[0], out_size[1]))) + layers.append(LayerNorm1D(out_depth, eps=0.001)) # Last layer is never activated (regardless of config). if cnn_transpose_activation is not None and not is_final_layer: layers.append(cnn_transpose_activation()) @@ -464,3 +466,20 @@ def forward(self, inputs): out = inputs.permute(0, 3, 1, 2) out = self.cnn_transpose(out.type(self.expected_input_dtype)) return out.permute(0, 2, 3, 1) + + +class LayerNorm1D(nn.Module): + def __init__(self, num_features, **kwargs): + super().__init__() + self.layer_norm = nn.LayerNorm(num_features, **kwargs) + + def forward(self, x): + # x shape: (B, dim, dim, channels). 
+ batch_size, channels, h, w = x.size() + # Reshape to (batch_size * height * width, channels) for LayerNorm + x = x.permute(0, 2, 3, 1).reshape(-1, channels) + # Apply LayerNorm + x = self.layer_norm(x) + # Reshape back to (batch_size, dim, dim, channels) + x = x.reshape(batch_size, h, w, channels).permute(0, 3, 1, 2) + return x diff --git a/rllib/core/models/torch/utils.py b/rllib/core/models/torch/utils.py index f9da0adab31a..1bdbdef016f4 100644 --- a/rllib/core/models/torch/utils.py +++ b/rllib/core/models/torch/utils.py @@ -45,28 +45,33 @@ def __init__(self, width, height, stride_w, stride_h): self.stride_w = stride_w self.stride_h = stride_h - self.zeros = torch.zeros( - size=( - self.width * self.stride_w - (self.stride_w - 1), - self.height * self.stride_h - (self.stride_h - 1), + self.register_buffer( + "zeros", + torch.zeros( + size=( + self.width * self.stride_w - (self.stride_w - 1), + self.height * self.stride_h - (self.stride_h - 1), + ), + dtype=torch.float32, ), - dtype=torch.float32, ) + self.out_width, self.out_height = self.zeros.shape[0], self.zeros.shape[1] # Squeeze in batch and channel dims. self.zeros = self.zeros.unsqueeze(0).unsqueeze(0) - self.where_template = torch.zeros( + where_template = torch.zeros( (self.stride_w, self.stride_h), dtype=torch.float32 ) # Set upper/left corner to 1.0. - self.where_template[0][0] = 1.0 + where_template[0][0] = 1.0 # then tile across the entire (strided) image size. - self.where_template = self.where_template.repeat((self.height, self.width))[ + where_template = where_template.repeat((self.height, self.width))[ : -(self.stride_w - 1), : -(self.stride_h - 1) ] # Squeeze in batch and channel dims and convert to bool. 
- self.where_template = self.where_template.unsqueeze(0).unsqueeze(0).bool() + where_template = where_template.unsqueeze(0).unsqueeze(0).bool() + self.register_buffer("where_template", where_template) def forward(self, x): # Repeat incoming image stride(w/h) times to match the strided output template. diff --git a/rllib/core/rl_module/torch/torch_rl_module.py b/rllib/core/rl_module/torch/torch_rl_module.py index 9cb4d2bda6c4..0ced41878552 100644 --- a/rllib/core/rl_module/torch/torch_rl_module.py +++ b/rllib/core/rl_module/torch/torch_rl_module.py @@ -110,7 +110,8 @@ def get_state(self, inference_only: bool = False) -> Mapping[str, Any]: @override(RLModule) def set_state(self, state_dict: Mapping[str, Any]) -> None: - self.load_state_dict(convert_to_torch_tensor(state_dict)) + state_dict = convert_to_torch_tensor(state_dict) + self.load_state_dict(state_dict) def _module_state_file_name(self) -> pathlib.Path: return pathlib.Path("module_state.pt") diff --git a/rllib/examples/catalogs/mobilenet_v2_encoder.py b/rllib/examples/catalogs/mobilenet_v2_encoder.py index beebdb79f773..ca44215b8bef 100644 --- a/rllib/examples/catalogs/mobilenet_v2_encoder.py +++ b/rllib/examples/catalogs/mobilenet_v2_encoder.py @@ -44,7 +44,10 @@ def _get_encoder_config( # Create a generic config with our enhanced Catalog ppo_config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .rl_module( rl_module_spec=SingleAgentRLModuleSpec( catalog_class=MobileNetEnhancedPPOCatalog diff --git a/rllib/examples/evaluation/custom_evaluation.py b/rllib/examples/evaluation/custom_evaluation.py index d396ffee04df..76aad3eccdf4 100644 --- a/rllib/examples/evaluation/custom_evaluation.py +++ b/rllib/examples/evaluation/custom_evaluation.py @@ -149,6 +149,7 @@ def custom_eval_function( eval_results = algorithm.metrics.reduce( key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS) ) + # Alternatively, you 
could manually reduce over the n returned `env_runner_metrics` # dicts, but this would be much harder as you might not know, which metrics # to sum up, which ones to average over, etc.. diff --git a/rllib/execution/rollout_ops.py b/rllib/execution/rollout_ops.py index 4b3582b9968b..a42a1991453d 100644 --- a/rllib/execution/rollout_ops.py +++ b/rllib/execution/rollout_ops.py @@ -24,6 +24,7 @@ def synchronous_parallel_sample( max_env_steps: Optional[int] = None, concat: bool = True, sample_timeout_s: Optional[float] = 60.0, + random_actions: bool = False, _uses_new_env_runners: bool = False, _return_metrics: bool = False, ) -> Union[List[SampleBatchType], SampleBatchType, List[EpisodeType], EpisodeType]: @@ -81,6 +82,8 @@ def synchronous_parallel_sample( sample_batches_or_episodes = [] all_stats_dicts = [] + random_action_kwargs = {} if not random_actions else {"random_actions": True} + # Stop collecting batches as soon as one criterium is met. while (max_agent_or_env_steps is None and agent_or_env_steps == 0) or ( max_agent_or_env_steps is not None @@ -89,16 +92,16 @@ def synchronous_parallel_sample( # No remote workers in the set -> Use local worker for collecting # samples. if worker_set.num_remote_workers() <= 0: - sampled_data = [worker_set.local_worker().sample()] + sampled_data = [worker_set.local_worker().sample(**random_action_kwargs)] if _return_metrics: stats_dicts = [worker_set.local_worker().get_metrics()] # Loop over remote workers' `sample()` method in parallel. 
else: sampled_data = worker_set.foreach_worker( ( - (lambda w: w.sample()) + (lambda w: w.sample(**random_action_kwargs)) if not _return_metrics - else (lambda w: (w.sample(), w.get_metrics())) + else (lambda w: (w.sample(**random_action_kwargs), w.get_metrics())) ), local_worker=False, timeout_seconds=sample_timeout_s, diff --git a/rllib/tuned_examples/dreamerv3/atari_100k.py b/rllib/tuned_examples/dreamerv3/atari_100k.py index 23a46fcbf3e7..68bdc9745136 100644 --- a/rllib/tuned_examples/dreamerv3/atari_100k.py +++ b/rllib/tuned_examples/dreamerv3/atari_100k.py @@ -9,17 +9,27 @@ """ # Run with: -# python run_regression_tests.py --dir [this file] --env ALE/[gym ID e.g. Pong-v5] +# python [this script name].py --env ALE/[gym ID e.g. Pong-v5] -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config +# To see all available options: +# python [this script name].py --help +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config +from ray.rllib.utils.test_utils import add_rllib_example_script_args -# Number of GPUs to run on. -num_gpus = 1 +parser = add_rllib_example_script_args( + default_iters=1000000, + default_reward=20.0, + default_timesteps=1000000, +) +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values to set up `config` below. +args = parser.parse_args() config = ( DreamerV3Config() .environment( + env=args.env, # [2]: "We follow the evaluation protocol of Machado et al. (2018) with 200M # environment steps, action repeat of 4, a time limit of 108,000 steps per # episode that correspond to 30 minutes of game play, no access to life @@ -34,31 +44,35 @@ "full_action_space": False, # Already done by MaxAndSkip wrapper: "action repeat" == 4.
"frameskip": 1, - } - ) - .resources( - num_cpus_for_main_process=1, - ) - .learners( - num_learners=0 if num_gpus == 1 else num_gpus, - num_gpus_per_learner=1 if num_gpus else 0, + }, ) .env_runners( + num_env_runners=(args.num_env_runners or 0), # If we use >1 GPU and increase the batch size accordingly, we should also # increase the number of envs per worker. - num_envs_per_env_runner=(num_gpus or 1), - remote_worker_envs=True, + num_envs_per_env_runner=(args.num_gpus or 1), + remote_worker_envs=(args.num_gpus > 1), + ) + .learners( + num_learners=0 if args.num_gpus == 1 else args.num_gpus, + num_gpus_per_learner=1 if args.num_gpus else 0, ) .reporting( - metrics_num_episodes_for_smoothing=(num_gpus or 1), - report_images_and_videos=False, - report_dream_data=False, + metrics_num_episodes_for_smoothing=(args.num_gpus or 1), + report_images_and_videos=True, + report_dream_data=True, report_individual_batch_item_stats=False, ) # See Appendix A. .training( model_size="S", training_ratio=1024, - batch_size_B=16 * (num_gpus or 1), + batch_size_B=16 * (args.num_gpus or 1), ) ) + + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args, stop={}, keep_config=True) diff --git a/rllib/tuned_examples/dreamerv3/atari_200M.py b/rllib/tuned_examples/dreamerv3/atari_200M.py index 2fb1e48b0929..2339d345d2f8 100644 --- a/rllib/tuned_examples/dreamerv3/atari_200M.py +++ b/rllib/tuned_examples/dreamerv3/atari_200M.py @@ -9,13 +9,22 @@ """ # Run with: -# python run_regression_tests.py --dir [this file] --env ALE/[gym ID e.g. Pong-v5] +# python [this script name].py --env ALE/[gym ID e.g. 
Pong-v5] -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config +# To see all available options: +# python [this script name].py --help +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config +from ray.rllib.utils.test_utils import add_rllib_example_script_args -# Number of GPUs to run on. -num_gpus = 1 +parser = add_rllib_example_script_args( + default_iters=1000000, + default_reward=20.0, + default_timesteps=1000000, +) +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values to set up `config` below. +args = parser.parse_args() config = ( DreamerV3Config() @@ -23,19 +32,10 @@ # For each (parallelized) env, we should provide a CPU. Lower this number # if you don't have enough CPUs. num_cpus_for_main_process=8 - * (num_gpus or 1), - ) - .learners( - num_learners=0 if num_gpus == 1 else num_gpus, - num_gpus_per_learner=1 if num_gpus else 0, - ) - .env_runners( - # If we use >1 GPU and increase the batch size accordingly, we should also - # increase the number of envs per worker. - num_envs_per_env_runner=8 * (num_gpus or 1), - remote_worker_envs=True, + * (args.num_gpus or 1), ) .environment( + env=args.env, # [2]: "We follow the evaluation protocol of Machado et al. (2018) with 200M # environment steps, action repeat of 4, a time limit of 108,000 steps per # episode that correspond to 30 minutes of game play, no access to life @@ -50,10 +50,21 @@ "full_action_space": False, # Already done by MaxAndSkip wrapper: "action repeat" == 4. "frameskip": 1, - } + }, + ) + .env_runners( + num_env_runners=(args.num_env_runners or 0), + # If we use >1 GPU and increase the batch size accordingly, we should also + # increase the number of envs per worker.
+ num_envs_per_env_runner=8 * (args.num_gpus or 1), + remote_worker_envs=True, + ) + .learners( + num_learners=0 if args.num_gpus == 1 else args.num_gpus, + num_gpus_per_learner=1 if args.num_gpus else 0, ) .reporting( - metrics_num_episodes_for_smoothing=(num_gpus or 1), + metrics_num_episodes_for_smoothing=(args.num_gpus or 1), report_images_and_videos=False, report_dream_data=False, report_individual_batch_item_stats=False, @@ -62,6 +73,12 @@ .training( model_size="XL", training_ratio=64, - batch_size_B=16 * (num_gpus or 1), + batch_size_B=16 * (args.num_gpus or 1), ) ) + + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args, keep_config=True) diff --git a/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py b/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py index b201900da5f6..21c1a435a034 100644 --- a/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py +++ b/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py @@ -7,30 +7,45 @@ D. Hafner, T. Lillicrap, M. Norouzi, J. Ba https://arxiv.org/pdf/2010.02193.pdf """ + # Run with: -# python run_regression_tests.py --dir [this file] --env DMC/[task]/[domain] -# e.g. --env=DMC/cartpole/swingup +# python [this script name].py --env DMC/[task]/[domain] (e.g. DMC/cartpole/swingup) -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config +# To see all available options: +# python [this script name].py --help +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config +from ray.rllib.utils.test_utils import add_rllib_example_script_args -# Number of GPUs to run on. -num_gpus = 1 +parser = add_rllib_example_script_args( + default_iters=1000000, + default_reward=800.0, + default_timesteps=1000000, +) +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values to set up `config` below.
+args = parser.parse_args() config = ( DreamerV3Config() # Use image observations. - .environment(env_config={"from_pixels": True}) - .resources( - num_cpus_for_main_process=1, + .environment( + env=args.env, + env_config={"from_pixels": True}, ) .learners( - num_learners=0 if num_gpus == 1 else num_gpus, - num_gpus_per_learner=1 if num_gpus else 0, + num_learners=0 if args.num_gpus == 1 else args.num_gpus, + num_gpus_per_learner=1 if args.num_gpus else 0, + ) + .env_runners( + num_env_runners=(args.num_env_runners or 0), + # If we use >1 GPU and increase the batch size accordingly, we should also + # increase the number of envs per worker. + num_envs_per_env_runner=4 * (args.num_gpus or 1), + remote_worker_envs=True, ) - .env_runners(num_envs_per_env_runner=4 * (num_gpus or 1), remote_worker_envs=True) .reporting( - metrics_num_episodes_for_smoothing=(num_gpus or 1), + metrics_num_episodes_for_smoothing=(args.num_gpus or 1), report_images_and_videos=False, report_dream_data=False, report_individual_batch_item_stats=False, @@ -39,6 +54,6 @@ .training( model_size="S", training_ratio=512, - batch_size_B=16 * (num_gpus or 1), + batch_size_B=16 * (args.num_gpus or 1), ) ) diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py index 39e087da9434..764edabdb8a2 100644 --- a/rllib/utils/metrics/__init__.py +++ b/rllib/utils/metrics/__init__.py @@ -1,6 +1,7 @@ # Algorithm ResultDict keys. 
EVALUATION_RESULTS = "evaluation" ENV_RUNNER_RESULTS = "env_runners" +REPLAY_BUFFER_RESULTS = "replay_buffer" LEARNER_RESULTS = "learners" FAULT_TOLERANCE_STATS = "fault_tolerance" TIMERS = "timers" diff --git a/rllib/utils/metrics/metrics_logger.py b/rllib/utils/metrics/metrics_logger.py index 8de81deb0038..611d6f7d9b8c 100644 --- a/rllib/utils/metrics/metrics_logger.py +++ b/rllib/utils/metrics/metrics_logger.py @@ -596,9 +596,9 @@ def deactivate_tensor_mode(self): def tensors_to_numpy(self, tensor_metrics): """Converts all previously logged and returned tensors back to numpy values.""" - for key, value in tensor_metrics.items(): + for key, values in tensor_metrics.items(): assert self._key_in_stats(key) - self._get_key(key).numpy(value) + self._get_key(key).set_to_numpy_values(values) @property def tensor_mode(self): @@ -727,6 +727,19 @@ def set_value( clear_on_reduce=clear_on_reduce, ) + def delete(self, *key: Tuple[str], key_error: bool = True) -> None: + """Deletes the given `key` from this metrics logger's stats. + + Args: + key: The key or key sequence (for nested location within self.stats), + to delete from this MetricsLogger's stats. + key_error: Whether to throw a KeyError if `key` cannot be found in `self`. + + Raises: + KeyError: If `key` cannot be found in `self` AND `key_error` is True.
+ """ + self._del_key(key, key_error) + def reduce( self, key: Optional[Union[str, Tuple[str]]] = None, @@ -894,6 +907,19 @@ def _set_key(self, flat_key, stats): _dict[key] = {} _dict = _dict[key] + def _del_key(self, flat_key, key_error=False): + flat_key = force_tuple(tree.flatten(flat_key)) + _dict = self.stats + try: + for i, key in enumerate(flat_key): + if i == len(flat_key) - 1: + del _dict[key] + return + _dict = _dict[key] + except KeyError as e: + if key_error: + raise e + @Deprecated(new="MetricsLogger.merge_and_log_n_dicts()", error=True) def log_n_dicts(self, *args, **kwargs): pass diff --git a/rllib/utils/metrics/stats.py b/rllib/utils/metrics/stats.py index eec5845fd1a9..5a7358fde45e 100644 --- a/rllib/utils/metrics/stats.py +++ b/rllib/utils/metrics/stats.py @@ -472,18 +472,19 @@ def merge_in_parallel(self, *others: "Stats") -> None: self.values = list(reversed(new_values)) - def numpy(self, value: Any = None) -> "Stats": - """Converts all of self's internal values to numpy (if a tensor).""" - if value is not None: - if self._reduce_method is None: - assert isinstance(value, list) and len(self.values) >= len(value) - self.values = convert_to_numpy(value) - else: - assert len(self.values) > 0 - self.values = [convert_to_numpy(value)] + def set_to_numpy_values(self, values) -> None: + """Converts `self.values` from tensors to actual numpy values. + + Args: + values: The (numpy) values to set `self.values` to. 
+ """ + numpy_values = convert_to_numpy(values) + if self._reduce_method is None: + assert isinstance(values, list) and len(self.values) >= len(values) + self.values = numpy_values else: - self.values = convert_to_numpy(self.values) - return self + assert len(self.values) > 0 + self.values = [numpy_values] def __len__(self) -> int: """Returns the length of the internal values list.""" @@ -613,8 +614,21 @@ def _reduced_values(self, values=None, window=None) -> Tuple[Any, Any]: reduce_in = reduce_in.float() reduced = reduce_meth(reduce_in) elif tf and tf.is_tensor(values[0]): - reduce_meth = getattr(tf, "reduce_" + self._reduce_method) - reduced = reduce_meth(values) + # TODO (sven): Currently, tensor metrics only work with window=1. + # We might want to enforce it more formally, b/c it's probably not a + # good idea to have MetricsLogger or Stats tinker with the actual + # computation graph that users are trying to build in their loss + # functions. + assert len(values) == 1 + # TODO (sven) If the shape is (), do NOT even use the reduce method. + # Using `tf.reduce_mean()` here actually led to a completely broken + # DreamerV3 (for a still unknown exact reason). + if len(values[0].shape) == 0: + reduced = values[0] + else: + reduce_meth = getattr(tf, "reduce_" + self._reduce_method) + reduced = reduce_meth(values) + else: reduce_meth = getattr(np, "nan" + self._reduce_method) reduced = reduce_meth(values) From c94d8c3d2b2e8324005110023188837a07bb9651 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen <128072568+can-anyscale@users.noreply.github.com> Date: Fri, 31 May 2024 11:33:31 -0700 Subject: [PATCH 61/65] [ci][microcheck] add a few cheap tests to microcheck (#45657) Add keys to a few cheap builds and tests that I noticed failed on people's PR so we can include them in microcheck. These tests are not covered in the scope of test_in_docker.
Test: - CI Signed-off-by: can --- .buildkite/build.rayci.yml | 1 + .buildkite/kuberay.rayci.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.buildkite/build.rayci.yml b/.buildkite/build.rayci.yml index 06e96da332f8..582f0d497b87 100644 --- a/.buildkite/build.rayci.yml +++ b/.buildkite/build.rayci.yml @@ -28,6 +28,7 @@ steps: - forge - label: ":tapioca: build: jar" + key: java_wheels tags: - java - oss diff --git a/.buildkite/kuberay.rayci.yml b/.buildkite/kuberay.rayci.yml index cbdbab5219ee..655a2a573f64 100644 --- a/.buildkite/kuberay.rayci.yml +++ b/.buildkite/kuberay.rayci.yml @@ -20,6 +20,7 @@ steps: - raycpubase - label: ":kubernetes: chaos {{matrix.workload}} under {{matrix.fault}}" + key: kuberay_tests tags: - python - docker From a30630a7451d9cc458f1973def33349468120608 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Fri, 31 May 2024 13:20:26 -0700 Subject: [PATCH 62/65] [Core] Ray c++ backend structured logging (#44468) Signed-off-by: Jiajun Yao --- src/ray/gcs/gcs_server/gcs_actor_manager.cc | 6 +- src/ray/raylet/node_manager.cc | 2 +- src/ray/util/logging.cc | 186 ++++++++++---------- src/ray/util/logging.h | 125 ++++++++----- src/ray/util/tests/logging_test.cc | 62 ++++++- 5 files changed, 237 insertions(+), 144 deletions(-) diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.cc b/src/ray/gcs/gcs_server/gcs_actor_manager.cc index 33183a0489a4..008b6c13e3a2 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.cc @@ -259,8 +259,10 @@ void GcsActorManager::HandleRegisterActor(rpc::RegisterActorRequest request, auto actor_id = ActorID::FromBinary(request.task_spec().actor_creation_task_spec().actor_id()); - RAY_LOG(INFO) << "Registering actor, job id = " << actor_id.JobId() - << ", actor id = " << actor_id; + RAY_LOG(INFO) + .WithField(kLogKeyJobID, actor_id.JobId()) + .WithField(kLogKeyActorID, actor_id) + << "Registering actor"; Status status = RegisterActor(request, [reply, 
send_reply_callback, actor_id]( diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index a4be13e6b533..29099231237d 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -284,7 +284,7 @@ NodeManager::NodeManager( RayConfig::instance().min_memory_free_bytes(), RayConfig::instance().memory_monitor_refresh_ms(), CreateMemoryUsageRefreshCallback())) { - RAY_LOG(INFO) << "Initializing NodeManager with ID " << self_node_id_; + RAY_LOG(INFO).WithField(kLogKeyNodeID, self_node_id_) << "Initializing NodeManager"; cluster_resource_scheduler_ = std::make_shared( io_service, scheduling::NodeID(self_node_id_.Binary()), diff --git a/src/ray/util/logging.cc b/src/ray/util/logging.cc index fdcf5d22b56c..5e3d55e46e79 100644 --- a/src/ray/util/logging.cc +++ b/src/ray/util/logging.cc @@ -36,6 +36,7 @@ #include "absl/debugging/stacktrace.h" #include "absl/debugging/symbolize.h" #include "absl/strings/str_format.h" +#include "nlohmann/json.hpp" #include "ray/util/event_label.h" #include "ray/util/filesystem.h" #include "ray/util/util.h" @@ -46,12 +47,19 @@ namespace ray { +// Format pattern is 2020-08-21 17:00:00,000 I 100 1001 msg. +// %L is loglevel, %P is process id, %t for thread id. +constexpr char kLogFormatTextPattern[] = "[%Y-%m-%d %H:%M:%S,%e %L %P %t] %v"; +constexpr char kLogFormatJsonPattern[] = + "{\"asctime\":\"%Y-%m-%d %H:%M:%S,%e\",\"levelname\":\"%L\"%v}"; + RayLogLevel RayLog::severity_threshold_ = RayLogLevel::INFO; std::string RayLog::app_name_ = ""; +std::string RayLog::component_name_ = ""; std::string RayLog::log_dir_ = ""; -// Format pattern is 2020-08-21 17:00:00,000 I 100 1001 msg. -// %L is loglevel, %P is process id, %t for thread id. 
-std::string RayLog::log_format_pattern_ = "[%Y-%m-%d %H:%M:%S,%e %L %P %t] %v"; +bool RayLog::log_format_json_ = false; +std::string RayLog::log_format_pattern_ = kLogFormatTextPattern; + std::string RayLog::logger_name_ = "ray_log_sink"; long RayLog::log_rotation_max_size_ = 1 << 29; long RayLog::log_rotation_file_num_ = 10; @@ -141,51 +149,8 @@ class DefaultStdErrLogger final { std::shared_ptr default_stderr_logger_; }; -class SpdLogMessage final { - public: - explicit SpdLogMessage(const char *file, - int line, - int loglevel, - std::shared_ptr expose_osstream) - : loglevel_(loglevel), expose_osstream_(expose_osstream) { - stream() << ConstBasename(file) << ":" << line << ": "; - } - - inline void Flush() { - auto logger = spdlog::get(RayLog::GetLoggerName()); - if (!logger) { - logger = DefaultStdErrLogger::Instance().GetDefaultLogger(); - } - - if (loglevel_ == static_cast(spdlog::level::critical)) { - stream() << "\n*** StackTrace Information ***\n" << ray::StackTrace(); - } - if (expose_osstream_) { - *expose_osstream_ << "\n*** StackTrace Information ***\n" << ray::StackTrace(); - } - // NOTE(lingxuan.zlx): See more fmt by visiting https://github.com/fmtlib/fmt. - logger->log( - static_cast(loglevel_), /*fmt*/ "{}", str_.str()); - logger->flush(); - } - - ~SpdLogMessage() { Flush(); } - inline std::ostream &stream() { return str_; } - - private: - SpdLogMessage(const SpdLogMessage &) = delete; - SpdLogMessage &operator=(const SpdLogMessage &) = delete; - - private: - std::ostringstream str_; - int loglevel_; - std::shared_ptr expose_osstream_; -}; - -typedef ray::SpdLogMessage LoggingProvider; - // Spdlog's severity map. 
-static int GetMappedSeverity(RayLogLevel severity) { +static spdlog::level::level_enum GetMappedSeverity(RayLogLevel severity) { switch (severity) { case RayLogLevel::TRACE: return spdlog::level::trace; @@ -208,9 +173,7 @@ static int GetMappedSeverity(RayLogLevel severity) { std::vector RayLog::fatal_log_callbacks_; -void RayLog::StartRayLog(const std::string &app_name, - RayLogLevel severity_threshold, - const std::string &log_dir) { +void RayLog::InitSeverityThreshold(RayLogLevel severity_threshold) { const char *var_value = std::getenv("RAY_BACKEND_LOG_LEVEL"); if (var_value != nullptr) { std::string data = var_value; @@ -234,12 +197,35 @@ void RayLog::StartRayLog(const std::string &app_name, << " to " << static_cast(severity_threshold); } severity_threshold_ = severity_threshold; +} + +void RayLog::InitLogFormat() { + // Default is plain text + log_format_json_ = false; + log_format_pattern_ = kLogFormatTextPattern; + + const char *var_value = std::getenv("RAY_BACKEND_LOG_JSON"); + if (var_value != nullptr) { + std::string data = var_value; + if (data == "1") { + log_format_json_ = true; + log_format_pattern_ = kLogFormatJsonPattern; + } + } +} + +void RayLog::StartRayLog(const std::string &app_name, + RayLogLevel severity_threshold, + const std::string &log_dir) { + InitSeverityThreshold(severity_threshold); + InitLogFormat(); + app_name_ = app_name; log_dir_ = log_dir; // All the logging sinks to add. std::vector sinks; - auto level = static_cast(severity_threshold_); + auto level = GetMappedSeverity(severity_threshold_); std::string app_name_without_path = app_name; if (app_name.empty()) { app_name_without_path = "DefaultApp"; @@ -274,8 +260,6 @@ void RayLog::StartRayLog(const std::string &app_name, log_rotation_file_num_ = file_num; } } - spdlog::set_pattern(log_format_pattern_); - spdlog::set_level(static_cast(severity_threshold_)); // Sink all log stuff to default file logger we defined here. We may need // multiple sinks for different files or loglevel. 
auto file_logger = spdlog::get(RayLog::GetLoggerName()); @@ -288,14 +272,11 @@ void RayLog::StartRayLog(const std::string &app_name, JoinPaths(log_dir_, app_name_without_path + "_" + std::to_string(pid) + ".log"), log_rotation_max_size_, log_rotation_file_num_); + file_sink->set_level(level); sinks.push_back(file_sink); } else { - // Format pattern is 2020-08-21 17:00:00,000 I 100 1001 msg. - // %L is loglevel, %P is process id, %t for thread id. - log_format_pattern_ = - "[%Y-%m-%d %H:%M:%S,%e %L %P %t] (" + app_name_without_path + ") %v"; + component_name_ = app_name_without_path; auto console_sink = std::make_shared(); - console_sink->set_pattern(log_format_pattern_); console_sink->set_level(level); sinks.push_back(console_sink); } @@ -303,7 +284,6 @@ void RayLog::StartRayLog(const std::string &app_name, // In all cases, log errors to the console log so they are in driver logs. // https://github.com/ray-project/ray/issues/12893 auto err_sink = std::make_shared(); - err_sink->set_pattern(log_format_pattern_); err_sink->set_level(spdlog::level::err); sinks.push_back(err_sink); @@ -311,9 +291,8 @@ void RayLog::StartRayLog(const std::string &app_name, auto logger = std::make_shared( RayLog::GetLoggerName(), sinks.begin(), sinks.end()); logger->set_level(level); + // Set the pattern of all sinks. 
logger->set_pattern(log_format_pattern_); - spdlog::set_level(static_cast(severity_threshold_)); - spdlog::set_pattern(log_format_pattern_); spdlog::set_default_logger(logger); initialized_ = true; @@ -418,58 +397,87 @@ void RayLog::AddFatalLogCallbacks( } RayLog::RayLog(const char *file_name, int line_number, RayLogLevel severity) - : logging_provider_(nullptr), - is_enabled_(severity >= severity_threshold_), + : is_enabled_(severity >= severity_threshold_), severity_(severity), is_fatal_(severity == RayLogLevel::FATAL) { if (is_fatal_) { - expose_osstream_ = std::make_shared(); - #ifdef _WIN32 int pid = _getpid(); #else pid_t pid = getpid(); #endif - *expose_osstream_ << absl::StrFormat("%s:%d (PID: %d, TID: %s, errno: %d (%s)):", - file_name, - line_number, - pid, - std::to_string(GetTid()), - errno, - strerror(errno)); + expose_fatal_osstream_ << absl::StrFormat("%s:%d (PID: %d, TID: %s, errno: %d (%s)):", + file_name, + line_number, + pid, + std::to_string(GetTid()), + errno, + strerror(errno)); } if (is_enabled_) { - logging_provider_ = new LoggingProvider( - file_name, line_number, GetMappedSeverity(severity), expose_osstream_); + if (log_format_json_) { + if (!component_name_.empty()) { + WithField(kLogKeyComponent, component_name_); + } + WithField(kLogKeyFilename, ConstBasename(file_name)); + WithField(kLogKeyLineno, line_number); + } else { + if (!component_name_.empty()) { + msg_osstream_ << "(" << component_name_ << ") "; + } + msg_osstream_ << ConstBasename(file_name) << ":" << line_number << ": "; + } } } -std::ostream &RayLog::Stream() { - auto logging_provider = reinterpret_cast(logging_provider_); - // Before calling this function, user should check IsEnabled. - // When IsEnabled == false, logging_provider_ will be empty. 
- return logging_provider->stream(); -} - bool RayLog::IsEnabled() const { return is_enabled_; } bool RayLog::IsFatal() const { return is_fatal_; } -std::ostream &RayLog::ExposeStream() { return *expose_osstream_; } - RayLog::~RayLog() { - if (logging_provider_ != nullptr) { - delete reinterpret_cast(logging_provider_); - logging_provider_ = nullptr; - } - if (expose_osstream_ != nullptr) { + if (IsFatal()) { + msg_osstream_ << "\n*** StackTrace Information ***\n" << ray::StackTrace(); + expose_fatal_osstream_ << "\n*** StackTrace Information ***\n" << ray::StackTrace(); for (const auto &callback : fatal_log_callbacks_) { - callback(EL_RAY_FATAL_CHECK_FAILED, expose_osstream_->str()); + callback(EL_RAY_FATAL_CHECK_FAILED, expose_fatal_osstream_.str()); } } + + auto logger = spdlog::get(RayLog::GetLoggerName()); + if (!logger) { + logger = DefaultStdErrLogger::Instance().GetDefaultLogger(); + } + // NOTE(lingxuan.zlx): See more fmt by visiting https://github.com/fmtlib/fmt. + if (log_format_json_) { + logger->log(GetMappedSeverity(severity_), + /*fmt*/ ",\"{}\":{}{}", + kLogKeyMessage, + nlohmann::json(msg_osstream_.str()).dump(), + context_osstream_.str()); + } else { + logger->log(GetMappedSeverity(severity_), + /*fmt*/ "{}{}", + msg_osstream_.str(), + context_osstream_.str()); + } + logger->flush(); + if (severity_ == RayLogLevel::FATAL) { std::_Exit(EXIT_FAILURE); } } +template <> +RayLog &RayLog::WithFieldJsonFormat(std::string_view key, + const std::string &value) { + context_osstream_ << ",\"" << key << "\":" << nlohmann::json(value).dump(); + return *this; +} + +template <> +RayLog &RayLog::WithFieldJsonFormat(std::string_view key, const int &value) { + context_osstream_ << ",\"" << key << "\":" << value; + return *this; +} + } // namespace ray diff --git a/src/ray/util/logging.h b/src/ray/util/logging.h index 2adf4b26f617..16abe4272509 100644 --- a/src/ray/util/logging.h +++ b/src/ray/util/logging.h @@ -85,6 +85,19 @@ enum { ERROR = 0 }; #endif namespace 
ray { +/// Sync with ray._private.structured_logging.constants.LogKey +constexpr std::string_view kLogKeyAsctime = "asctime"; +constexpr std::string_view kLogKeyLevelname = "levelname"; +constexpr std::string_view kLogKeyMessage = "message"; +constexpr std::string_view kLogKeyFilename = "filename"; +constexpr std::string_view kLogKeyLineno = "lineno"; +constexpr std::string_view kLogKeyComponent = "component"; +constexpr std::string_view kLogKeyJobID = "job_id"; +constexpr std::string_view kLogKeyWorkerID = "worker_id"; +constexpr std::string_view kLogKeyNodeID = "node_id"; +constexpr std::string_view kLogKeyActorID = "actor_id"; +constexpr std::string_view kLogKeyTaskID = "task_id"; + class StackTrace { /// This dumps the current stack trace information. friend std::ostream &operator<<(std::ostream &os, const StackTrace &stack_trace); @@ -208,50 +221,24 @@ enum class RayLogLevel { // which hide the implementation into logging.cc file. // In logging.cc, we can choose different log libs using different macros. -// This is also a null log which does not output anything. -class RayLogBase { - public: - virtual ~RayLogBase(){}; - - // By default, this class is a null log because it return false here. - virtual bool IsEnabled() const { return false; }; - - // This function to judge whether current log is fatal or not. - virtual bool IsFatal() const { return false; }; - - template - RayLogBase &operator<<(const T &t) { - if (IsEnabled()) { - Stream() << t; - } - if (IsFatal()) { - ExposeStream() << t; - } - return *this; - } - - protected: - virtual std::ostream &Stream() { return std::cerr; }; - virtual std::ostream &ExposeStream() { return std::cerr; }; -}; - /// Callback function which will be triggered to expose fatal log. /// The first argument: a string representing log type or label. /// The second argument: log content. 
using FatalLogCallback = std::function; -class RayLog : public RayLogBase { +class RayLog { public: RayLog(const char *file_name, int line_number, RayLogLevel severity); - virtual ~RayLog(); + ~RayLog(); /// Return whether or not current logging instance is enabled. /// /// \return True if logging is enabled and false otherwise. - virtual bool IsEnabled() const; + bool IsEnabled() const; - virtual bool IsFatal() const; + /// This function to judge whether current log is fatal or not. + bool IsFatal() const; /// The init function of ray log for a program which should be called only once. /// @@ -296,9 +283,6 @@ class RayLog : public RayLogBase { /// To check failure signal handler enabled or not. static bool IsFailureSignalHandlerEnabled(); - /// Get the log level from environment variable. - static RayLogLevel GetLogLevelFromEnv(); - static std::string GetLogFormatPattern(); static std::string GetLoggerName(); @@ -307,35 +291,81 @@ class RayLog : public RayLogBase { static void AddFatalLogCallbacks( const std::vector &expose_log_callbacks); + template + RayLog &operator<<(const T &t) { + if (IsEnabled()) { + msg_osstream_ << t; + } + if (IsFatal()) { + expose_fatal_osstream_ << t; + } + return *this; + } + + /// Add log context to the log. + /// Caller should make sure key is not duplicated + /// and doesn't conflict with system keys like levelname. + template + RayLog &WithField(std::string_view key, const T &value) { + if (log_format_json_) { + return WithFieldJsonFormat(key, value); + } else { + return WithFieldTextFormat(key, value); + } + } + private: FRIEND_TEST(PrintLogTest, TestRayLogEveryNOrDebug); FRIEND_TEST(PrintLogTest, TestRayLogEveryN); - // Hide the implementation of log provider by void *. - // Otherwise, lib user may define the same macro to use the correct header file. 
- void *logging_provider_; + + template + RayLog &WithFieldTextFormat(std::string_view key, const T &value) { + context_osstream_ << " " << key << "=" << value; + return *this; + } + + template + RayLog &WithFieldJsonFormat(std::string_view key, const T &value) { + std::stringstream ss; + ss << value; + return WithFieldJsonFormat(key, ss.str()); + } + + static void InitSeverityThreshold(RayLogLevel severity_threshold); + static void InitLogFormat(); + /// True if log messages should be logged and false if they should be ignored. bool is_enabled_; /// log level. RayLogLevel severity_; /// Whether current log is fatal or not. bool is_fatal_ = false; - /// String stream of exposed log content. - std::shared_ptr expose_osstream_ = nullptr; + /// String stream of the log message + std::ostringstream msg_osstream_; + /// String stream of the log context: a list of key-value pairs. + std::ostringstream context_osstream_; + /// String stream of exposed fatal log content. + std::ostringstream expose_fatal_osstream_; + /// Whether or not the log is initialized. static std::atomic initialized_; /// Callback functions which will be triggered to expose fatal log. static std::vector fatal_log_callbacks_; static RayLogLevel severity_threshold_; - // In InitGoogleLogging, it simply keeps the pointer. - // We need to make sure the app name passed to InitGoogleLogging exist. static std::string app_name_; + /// This is used when we log to stderr + /// to indicate which component generates the log. + /// This is empty if we log to file. + static std::string component_name_; /// The directory where the log files are stored. /// If this is empty, logs are printed to stdout. static std::string log_dir_; /// This flag is used to avoid calling UninstallSignalAction in ShutDownRayLog if /// InstallFailureSignalHandler was not called. static bool is_failure_signal_handler_installed_; - // Log format content. + /// Whether emit json logs. 
+ static bool log_format_json_; + // Log format pattern. static std::string log_format_pattern_; // Log rotation file size limitation. static long log_rotation_max_size_; @@ -345,17 +375,22 @@ class RayLog : public RayLogBase { static std::string logger_name_; protected: - virtual std::ostream &Stream(); - virtual std::ostream &ExposeStream(); + virtual std::ostream &Stream() { return msg_osstream_; } }; +template <> +RayLog &RayLog::WithFieldJsonFormat(std::string_view key, + const std::string &value); +template <> +RayLog &RayLog::WithFieldJsonFormat(std::string_view key, const int &value); + // This class make RAY_CHECK compilation pass to change the << operator to void. class Voidify { public: Voidify() {} // This has to be an operator with a precedence lower than << but // higher than ?: - void operator&(RayLogBase &) {} + void operator&(RayLog &) {} }; } // namespace ray diff --git a/src/ray/util/tests/logging_test.cc b/src/ray/util/tests/logging_test.cc index 965e443eb466..b09197b64a0f 100644 --- a/src/ray/util/tests/logging_test.cc +++ b/src/ray/util/tests/logging_test.cc @@ -16,24 +16,22 @@ #include #include +#include #include #include "absl/strings/str_format.h" +#include "absl/strings/str_split.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "nlohmann/json.hpp" #include "ray/util/filesystem.h" +#include "ray/util/util.h" using namespace testing; +using json = nlohmann::json; namespace ray { -int64_t current_time_ms() { - std::chrono::milliseconds ms_since_epoch = - std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()); - return ms_since_epoch.count(); -} - // This is not really test. // This file just print some information using the logging macro. 
@@ -61,7 +59,9 @@ TEST(PrintLogTest, LogTestWithoutInit) { #if GTEST_HAS_STREAM_REDIRECTION using testing::internal::CaptureStderr; +using testing::internal::CaptureStdout; using testing::internal::GetCapturedStderr; +using testing::internal::GetCapturedStdout; namespace { void VerifyOnlyNthOccurenceLogged(bool fallback_to_debug) { @@ -186,6 +186,54 @@ TEST(PrintLogTest, TestRayLogEveryMs) { EXPECT_LT(occurrences, 15); } +TEST(PrintLogTest, TestTextLogging) { + setenv("RAY_BACKEND_LOG_JSON", "0", true); + RayLog::StartRayLog("/tmp/gcs", RayLogLevel::INFO, ""); + CaptureStdout(); + RAY_LOG(INFO).WithField("key1", "value1").WithField("key2", "value2") + << "contextual log"; + + std::vector log_lines = + absl::StrSplit(GetCapturedStdout(), '\n', absl::SkipEmpty()); + ASSERT_EQ(1, log_lines.size()); + ASSERT_NE(log_lines[0].find("contextual log key1=value1 key2=value2"), + std::string::npos); + + RayLog::ShutDownRayLog(); + unsetenv("RAY_BACKEND_LOG_JSON"); +} + +TEST(PrintLogTest, TestJSONLogging) { + setenv("RAY_BACKEND_LOG_JSON", "1", true); + RayLog::StartRayLog("/tmp/raylet", RayLogLevel::INFO, ""); + CaptureStdout(); + RAY_LOG(DEBUG) << "this is not logged"; + RAY_LOG(INFO) << "this is info logged"; + RAY_LOG(WARNING) << "this needs\nescape\""; + RAY_LOG(INFO).WithField("key1", "value1").WithField("key2", "value\n2") + << "contextual log"; + + std::vector log_lines = + absl::StrSplit(GetCapturedStdout(), '\n', absl::SkipEmpty()); + ASSERT_EQ(3, log_lines.size()); + json log1 = json::parse(log_lines[0]); + json log2 = json::parse(log_lines[1]); + json log3 = json::parse(log_lines[2]); + ASSERT_EQ(log1[std::string(kLogKeyMessage)], "this is info logged"); + ASSERT_EQ(log2[std::string(kLogKeyMessage)], "this needs\nescape\""); + ASSERT_EQ(log3[std::string(kLogKeyMessage)], "contextual log"); + ASSERT_TRUE(log3.contains(kLogKeyAsctime)); + ASSERT_TRUE(log3.contains(kLogKeyFilename)); + ASSERT_TRUE(log3.contains(kLogKeyLineno)); + 
ASSERT_EQ(log3[std::string(kLogKeyLevelname)], "I"); + ASSERT_EQ(log3[std::string(kLogKeyComponent)], "raylet"); + ASSERT_EQ(log3["key1"], "value1"); + ASSERT_EQ(log3["key2"], "value\n2"); + + RayLog::ShutDownRayLog(); + unsetenv("RAY_BACKEND_LOG_JSON"); +} + #endif /* GTEST_HAS_STREAM_REDIRECTION */ TEST(PrintLogTest, LogTestWithInit) { From 3c9edf1b790dc65c36ee27d4032ab75a89e41930 Mon Sep 17 00:00:00 2001 From: Cheng Su Date: Fri, 31 May 2024 13:43:49 -0700 Subject: [PATCH 63/65] [Data] Record more telemetry for newly added datasources (#45647) This PR is to add the telemetry recording for newly added datasources. Signed-off-by: Cheng Su --- python/ray/data/_internal/logical/util.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/ray/data/_internal/logical/util.py b/python/ray/data/_internal/logical/util.py index 084bb2c67aff..af6f2420a269 100644 --- a/python/ray/data/_internal/logical/util.py +++ b/python/ray/data/_internal/logical/util.py @@ -28,8 +28,14 @@ "ReadNumpy", "ReadTFRecord", "ReadBinary", - "ReadCustom", "ReadTorch", + "ReadAvro", + "ReadWebDataset", + "ReadSQL", + "ReadDatabricksUC", + "ReadLance", + "ReadHuggingFace", + "ReadCustom", # From "FromArrow", "FromItems", @@ -43,6 +49,8 @@ "WriteTFRecord", "WriteNumpy", "WriteMongo", + "WriteWebDataset", + "WriteSQL", "WriteCustom", # Map "Map", From fe191e6c81178de48a404bb279233651aad8a63e Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Fri, 31 May 2024 14:06:59 -0700 Subject: [PATCH 64/65] [Core] Expose NodeDeathInfo in state CLI (#45644) Signed-off-by: Rui Qiao --- dashboard/datacenter.py | 3 ++- dashboard/state_aggregator.py | 5 +++++ dashboard/utils.py | 11 +++++++---- python/ray/tests/test_state_api.py | 18 +++++++++++------- python/ray/tests/test_task_events.py | 3 ++- python/ray/util/state/common.py | 3 +++ .../transport/direct_task_transport.cc | 17 ++++++++++------- 7 files changed, 40 insertions(+), 20 
deletions(-) diff --git a/dashboard/datacenter.py b/dashboard/datacenter.py index 1e4627eadc86..f7702e64411e 100644 --- a/dashboard/datacenter.py +++ b/dashboard/datacenter.py @@ -138,8 +138,9 @@ async def get_node_info(cls, node_id, get_summary=False): # Merge GcsNodeInfo to node physical stats node_info["raylet"].update(node) + death_info = node.get("deathInfo", {}) node_info["raylet"]["stateMessage"] = compose_state_message( - node.get("deathInfo", {}) + death_info.get("reason", None), death_info.get("reasonMessage", None) ) if not get_summary: diff --git a/dashboard/state_aggregator.py b/dashboard/state_aggregator.py index e3aa7a9f8c6a..5c8591484d32 100644 --- a/dashboard/state_aggregator.py +++ b/dashboard/state_aggregator.py @@ -11,6 +11,7 @@ from ray._private.profiling import chrome_tracing_dump import ray.dashboard.memory_utils as memory_utils +from ray.dashboard.utils import compose_state_message from ray.util.state.common import ( protobuf_message_to_dict, @@ -323,6 +324,10 @@ async def list_nodes(self, *, option: ListApiOptions) -> ListApiResponse: data["node_ip"] = data["node_manager_address"] data["start_time_ms"] = int(data["start_time_ms"]) data["end_time_ms"] = int(data["end_time_ms"]) + death_info = data.get("death_info", {}) + data["state_message"] = compose_state_message( + death_info.get("reason", None), death_info.get("reason_message", None) + ) result.append(data) diff --git a/dashboard/utils.py b/dashboard/utils.py index 5d5c4a0284c7..99c624803bbe 100644 --- a/dashboard/utils.py +++ b/dashboard/utils.py @@ -656,13 +656,17 @@ def get_address_for_submission_client(address: Optional[str]) -> str: return address -def compose_state_message(death_info_dict: dict) -> Optional[str]: +def compose_state_message( + death_reason: Optional[str], death_reason_message: Optional[str] +) -> Optional[str]: """Compose node state message based on death information. Args: - death_info_dict: the node_death field in GcsNodeInfo, in dict type. 
+ death_reason: The reason of node death. + This is a string representation of `gcs_pb2.NodeDeathInfo.Reason`. + death_reason_message: The message of node death. + This corresponds to `gcs_pb2.NodeDeathInfo.ReasonMessage`. """ - death_reason = death_info_dict.get("reason", None) if death_reason == "EXPECTED_TERMINATION": state_message = "Expected termination" elif death_reason == "UNEXPECTED_TERMINATION": @@ -674,7 +678,6 @@ def compose_state_message(death_info_dict: dict) -> Optional[str]: else: state_message = None - death_reason_message = death_info_dict.get("reasonMessage", None) if death_reason_message: if state_message: state_message += f": {death_reason_message}" diff --git a/python/ray/tests/test_state_api.py b/python/ray/tests/test_state_api.py index b6555e1e9ab1..f9f582547f9c 100644 --- a/python/ray/tests/test_state_api.py +++ b/python/ray/tests/test_state_api.py @@ -2154,19 +2154,23 @@ def test_list_get_nodes(ray_start_cluster): cluster = ray_start_cluster cluster.add_node(num_cpus=1, node_name="head_node") ray.init(address=cluster.address) - cluster.add_node(num_cpus=1, node_name="worker_node") + worker_node = cluster.add_node(num_cpus=1, node_name="worker_node") + + cluster.remove_node(worker_node) def verify(): nodes = list_nodes(detail=True) for node in nodes: - assert node["state"] == "ALIVE" assert is_hex(node["node_id"]) - assert ( - node["is_head_node"] - if node["node_name"] == "head_node" - else not node["is_head_node"] - ) assert node["labels"] == {"ray.io/node_id": node["node_id"]} + if node["node_name"] == "head_node": + assert node["is_head_node"] + assert node["state"] == "ALIVE" + assert node["state_message"] is None + else: + assert not node["is_head_node"] + assert node["state"] == "DEAD" + assert node["state_message"] == "Expected termination: received SIGTERM" # Check with legacy API check_nodes = ray.nodes() diff --git a/python/ray/tests/test_task_events.py b/python/ray/tests/test_task_events.py index 89434049349d..b1210b59aff3 
100644 --- a/python/ray/tests/test_task_events.py +++ b/python/ray/tests/test_task_events.py @@ -204,7 +204,8 @@ def driver_running(): verify_failed_task, name="node-killed", error_type="NODE_DIED", - error_message="Task failed due to the node dying", + error_message="Task failed due to the node (where this task was running) " + " was dead or unavailable", ) diff --git a/python/ray/util/state/common.py b/python/ray/util/state/common.py index 366648674ee4..9dbb56286c61 100644 --- a/python/ray/util/state/common.py +++ b/python/ray/util/state/common.py @@ -507,6 +507,9 @@ class NodeState(StateSchema): #: ALIVE: The node is alive. #: DEAD: The node is dead. state: TypeNodeStatus = state_column(filterable=True) + #: The state message of the node. + #: This provides more detailed information about the node's state. + state_message: Optional[str] = state_column(filterable=False) #: The name of the node if it is given by the name argument. node_name: str = state_column(filterable=True) #: The total resources of the node. diff --git a/src/ray/core_worker/transport/direct_task_transport.cc b/src/ray/core_worker/transport/direct_task_transport.cc index 54799a1acba0..b419fae2ffea 100644 --- a/src/ray/core_worker/transport/direct_task_transport.cc +++ b/src/ray/core_worker/transport/direct_task_transport.cc @@ -728,13 +728,16 @@ void CoreWorkerDirectTaskSubmitter::HandleGetTaskFailureCause( << " ip: " << addr.ip_address(); task_error_type = rpc::ErrorType::NODE_DIED; std::stringstream buffer; - buffer << "Task failed due to the node dying.\n\nThe node (IP: " << addr.ip_address() - << ", node ID: " << NodeID::FromBinary(addr.raylet_id()) - << ") where this task was running crashed unexpectedly. 
" - << "This can happen if: (1) the instance where the node was running failed, " - "(2) raylet crashes unexpectedly (OOM, preempted node, etc).\n\n" - << "To see more information about the crash, use `ray logs raylet.out -ip " - << addr.ip_address() << "`"; + buffer << "Task failed due to the node (where this task was running) " + << " was dead or unavailable.\n\nThe node IP: " << addr.ip_address() + << ", node ID: " << NodeID::FromBinary(addr.raylet_id()) << "\n\n" + << "This can happen if the instance where the node was running failed, " + << "the node was preempted, or raylet crashed unexpectedly " + << "(e.g., due to OOM) etc.\n\n" + << "To see node death information, use `ray list nodes --filter \"node_id=" + << NodeID::FromBinary(addr.raylet_id()) << "\"`, " + << "or check Ray dashboard cluster page, or search the node ID in GCS log, " + << "or use `ray logs raylet.out -ip " << addr.ip_address() << "`"; error_info = std::make_unique(); error_info->set_error_message(buffer.str()); error_info->set_error_type(rpc::ErrorType::NODE_DIED); From 7021b10356069cf424556f1a5683c5f270a87e5b Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Fri, 31 May 2024 21:27:39 -0700 Subject: [PATCH 65/65] [Core] Expose NodeDeathInfo in ActorDiedError (#45497) Signed-off-by: Rui Qiao --- dashboard/tests/test_dashboard.py | 4 +- python/ray/exceptions.py | 14 +++--- python/ray/tests/test_actor_failures.py | 2 +- python/ray/tests/test_draining.py | 18 +++++--- python/ray/tests/test_node_death.py | 43 +++++++++++++++++-- python/ray/tests/test_runtime_env_agent.py | 4 +- .../transport/direct_actor_task_submitter.cc | 13 +++--- src/ray/gcs/gcs_server/gcs_actor_manager.cc | 34 +++++++++++---- src/ray/protobuf/common.proto | 18 +++++++- src/ray/protobuf/gcs.proto | 14 ------ 10 files changed, 114 insertions(+), 50 deletions(-) diff --git a/dashboard/tests/test_dashboard.py b/dashboard/tests/test_dashboard.py index a5254179d948..1fc3ad1bb630 
100644 --- a/dashboard/tests/test_dashboard.py +++ b/dashboard/tests/test_dashboard.py @@ -36,7 +36,7 @@ wait_until_server_available, wait_until_succeeded_without_exception, ) -from ray.core.generated import gcs_pb2 +from ray.core.generated import common_pb2 import ray.scripts.scripts as scripts from ray.dashboard import dashboard from ray.dashboard.head import DashboardHead @@ -219,7 +219,7 @@ def test_raylet_and_agent_share_fate(shutdown_only): node for node in ray.nodes() if node["NodeID"] == worker_node_id ][0] assert not worker_node_info["Alive"] - assert worker_node_info["DeathReason"] == gcs_pb2.NodeDeathInfo.Reason.Value( + assert worker_node_info["DeathReason"] == common_pb2.NodeDeathInfo.Reason.Value( "UNEXPECTED_TERMINATION" ) assert ( diff --git a/python/ray/exceptions.py b/python/ray/exceptions.py index e35eee260d95..14c308e20eac 100644 --- a/python/ray/exceptions.py +++ b/python/ray/exceptions.py @@ -13,6 +13,7 @@ ActorDiedErrorContext, Address, Language, + NodeDeathInfo, RayException, ) from ray.util.annotations import DeveloperAPI, PublicAPI @@ -339,7 +340,9 @@ class ActorDiedError(RayActorError): BASE_ERROR_MSG = "The actor died unexpectedly before finishing this task." - def __init__(self, cause: Union[RayTaskError, ActorDiedErrorContext] = None): + def __init__( + self, cause: Optional[Union[RayTaskError, ActorDiedErrorContext]] = None + ): """ Construct a RayActorError by building the arguments. """ @@ -380,11 +383,12 @@ def __init__(self, cause: Union[RayTaskError, ActorDiedErrorContext] = None): error_msg_lines.append( "The actor never ran - it was cancelled before it started running." ) - if cause.preempted: + if ( + cause.node_death_info + and cause.node_death_info.reason + == NodeDeathInfo.AUTOSCALER_DRAIN_PREEMPTED + ): preempted = True - error_msg_lines.append( - "\tThe actor's node was killed by a spot preemption." 
- ) error_msg = "\n".join(error_msg_lines) actor_id = ActorID(cause.actor_id).hex() super().__init__(actor_id, error_msg, actor_init_failed, preempted) diff --git a/python/ray/tests/test_actor_failures.py b/python/ray/tests/test_actor_failures.py index 86e14eaee994..975b942d5f9f 100644 --- a/python/ray/tests/test_actor_failures.py +++ b/python/ray/tests/test_actor_failures.py @@ -748,7 +748,7 @@ def create_actor(self): cluster.remove_node(node_to_kill) with pytest.raises( ray.exceptions.RayActorError, - match="The actor is dead because its node has died.", + match="The actor died because its node has died.", ) as exc_info: ray.get(a.check_alive.remote()) assert exc_info.value.actor_id == a._actor_id.hex() diff --git a/python/ray/tests/test_draining.py b/python/ray/tests/test_draining.py index 360e5d5d34ad..ee942f4c13bf 100644 --- a/python/ray/tests/test_draining.py +++ b/python/ray/tests/test_draining.py @@ -4,7 +4,7 @@ import ray import time from ray._raylet import GcsClient -from ray.core.generated import autoscaler_pb2, gcs_pb2 +from ray.core.generated import autoscaler_pb2, common_pb2 from ray._private.test_utils import wait_for_condition from ray.util.scheduling_strategies import ( NodeAffinitySchedulingStrategy, @@ -69,7 +69,7 @@ def drain_until_accept(): ) worker_node = [node for node in ray.nodes() if node["NodeID"] == worker_node_id][0] - assert worker_node["DeathReason"] == gcs_pb2.NodeDeathInfo.Reason.Value( + assert worker_node["DeathReason"] == common_pb2.NodeDeathInfo.Reason.Value( "AUTOSCALER_DRAIN_IDLE" ) assert worker_node["DeathReasonMessage"] == "idle for long enough" @@ -137,7 +137,7 @@ def ping(self): ) worker_node = [node for node in ray.nodes() if node["NodeID"] == worker_node_id][0] - assert worker_node["DeathReason"] == gcs_pb2.NodeDeathInfo.Reason.Value( + assert worker_node["DeathReason"] == common_pb2.NodeDeathInfo.Reason.Value( "AUTOSCALER_DRAIN_PREEMPTED" ) assert worker_node["DeathReasonMessage"] == "preemption" @@ -195,7 +195,7 @@ 
def ping(self): ) worker_node = [node for node in ray.nodes() if node["NodeID"] == worker_node_id][0] - assert worker_node["DeathReason"] == gcs_pb2.NodeDeathInfo.Reason.Value( + assert worker_node["DeathReason"] == common_pb2.NodeDeathInfo.Reason.Value( "AUTOSCALER_DRAIN_PREEMPTED" ) assert worker_node["DeathReasonMessage"] == "preemption" @@ -251,7 +251,7 @@ def ping(self): # Since worker node failure is detected to be before the draining deadline, # this is considered as an unexpected termination. worker_node = [node for node in ray.nodes() if node["NodeID"] == worker_node_id][0] - assert worker_node["DeathReason"] == gcs_pb2.NodeDeathInfo.Reason.Value( + assert worker_node["DeathReason"] == common_pb2.NodeDeathInfo.Reason.Value( "UNEXPECTED_TERMINATION" ) assert ( @@ -412,11 +412,12 @@ def ping(self): actor = Actor.options(num_cpus=0, resources={"node2": 1}).remote() ray.get(actor.ping.remote()) + drain_reason_message = "testing node preemption." # Preemption is always accepted. is_accepted, _ = gcs_client.drain_node( node2_id, autoscaler_pb2.DrainNodeReason.Value("DRAIN_NODE_REASON_PREEMPTION"), - "preemption", + drain_reason_message, 1, ) assert is_accepted @@ -426,8 +427,11 @@ def ping(self): try: ray.get(actor.ping.remote()) raise - except ray.exceptions.RayActorError as e: + except ray.exceptions.ActorDiedError as e: assert e.preempted + if graceful: + assert "The actor died because its node has died." 
in str(e) + assert "the actor's node was preempted: " + drain_reason_message in str(e) if __name__ == "__main__": diff --git a/python/ray/tests/test_node_death.py b/python/ray/tests/test_node_death.py index de36091c4341..cc2ef747102b 100644 --- a/python/ray/tests/test_node_death.py +++ b/python/ray/tests/test_node_death.py @@ -4,7 +4,7 @@ import ray from ray._private.test_utils import wait_for_condition -from ray.core.generated import gcs_pb2 +from ray.core.generated import common_pb2 def test_normal_termination(ray_start_cluster): @@ -14,17 +14,35 @@ def test_normal_termination(ray_start_cluster): worker_node = cluster.add_node(resources={"worker": 1}) cluster.wait_for_nodes() worker_node_id = worker_node.node_id + + @ray.remote + class Actor: + def ping(self): + pass + + actor = Actor.options(num_cpus=0, resources={"worker": 1}).remote() + ray.get(actor.ping.remote()) + + # normal node termination cluster.remove_node(worker_node) worker_node_info = [ node for node in ray.nodes() if node["NodeID"] == worker_node_id ][0] assert not worker_node_info["Alive"] - assert worker_node_info["DeathReason"] == gcs_pb2.NodeDeathInfo.Reason.Value( + assert worker_node_info["DeathReason"] == common_pb2.NodeDeathInfo.Reason.Value( "EXPECTED_TERMINATION" ) assert worker_node_info["DeathReasonMessage"] == "received SIGTERM" + try: + ray.get(actor.ping.remote()) + raise + except ray.exceptions.ActorDiedError as e: + assert not e.preempted + assert "The actor died because its node has died." 
in str(e) + assert "the actor's node was terminated expectedly: received SIGTERM" in str(e) + def test_abnormal_termination(monkeypatch, ray_start_cluster): monkeypatch.setenv("RAY_health_check_failure_threshold", "3") @@ -46,6 +64,14 @@ def test_abnormal_termination(monkeypatch, ray_start_cluster): == {head_node_id, worker_node_id} ) + @ray.remote + class Actor: + def ping(self): + pass + + actor = Actor.options(num_cpus=0, resources={"worker": 1}).remote() + ray.get(actor.ping.remote()) + # Simulate the worker node crashes. cluster.remove_node(worker_node, False) @@ -55,7 +81,7 @@ def test_abnormal_termination(monkeypatch, ray_start_cluster): ) worker_node = [node for node in ray.nodes() if node["NodeID"] == worker_node_id][0] - assert worker_node["DeathReason"] == gcs_pb2.NodeDeathInfo.Reason.Value( + assert worker_node["DeathReason"] == common_pb2.NodeDeathInfo.Reason.Value( "UNEXPECTED_TERMINATION" ) assert ( @@ -63,6 +89,17 @@ def test_abnormal_termination(monkeypatch, ray_start_cluster): == "health check failed due to missing too many heartbeats" ) + try: + ray.get(actor.ping.remote()) + raise + except ray.exceptions.ActorDiedError as e: + assert not e.preempted + assert "The actor died because its node has died." 
in str(e) + assert ( + "the actor's node was terminated unexpectedly: " + "health check failed due to missing too many heartbeats" in str(e) + ) + if __name__ == "__main__": sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/tests/test_runtime_env_agent.py b/python/ray/tests/test_runtime_env_agent.py index 4af9165754a3..0320bcba8b94 100644 --- a/python/ray/tests/test_runtime_env_agent.py +++ b/python/ray/tests/test_runtime_env_agent.py @@ -13,7 +13,7 @@ init_error_pubsub, wait_for_condition, ) -from ray.core.generated import gcs_pb2 +from ray.core.generated import common_pb2 from ray.runtime_env import RuntimeEnv import psutil @@ -160,7 +160,7 @@ def test_raylet_and_agent_share_fate(shutdown_only): node for node in ray.nodes() if node["NodeID"] == worker_node_id ][0] assert not worker_node_info["Alive"] - assert worker_node_info["DeathReason"] == gcs_pb2.NodeDeathInfo.Reason.Value( + assert worker_node_info["DeathReason"] == common_pb2.NodeDeathInfo.Reason.Value( "UNEXPECTED_TERMINATION" ) assert ( diff --git a/src/ray/core_worker/transport/direct_actor_task_submitter.cc b/src/ray/core_worker/transport/direct_actor_task_submitter.cc index 7c740f4cea70..0bda386195cc 100644 --- a/src/ray/core_worker/transport/direct_actor_task_submitter.cc +++ b/src/ray/core_worker/transport/direct_actor_task_submitter.cc @@ -347,12 +347,13 @@ void CoreWorkerDirectActorTaskSubmitter::FailTaskWithError( // Special error for preempted actor. The task "timed out" because the actor may // not have sent a notification to the gcs; regardless we already know it's // preempted and it's dead. 
- rpc::ActorDeathCause &actor_death_cause = *error_info.mutable_actor_died_error(); - actor_death_cause.mutable_actor_died_error_context()->set_actor_id( - task.task_spec.ActorId().Binary()); - actor_death_cause.mutable_actor_died_error_context()->set_preempted( - task.actor_preempted); - + auto actor_death_cause = error_info.mutable_actor_died_error(); + auto actor_died_error_context = actor_death_cause->mutable_actor_died_error_context(); + actor_died_error_context->set_actor_id(task.task_spec.ActorId().Binary()); + auto node_death_info = actor_died_error_context->mutable_node_death_info(); + node_death_info->set_reason(rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED); + node_death_info->set_reason_message( + "the node was inferred to be dead due to draining."); error_info.set_error_type(rpc::ErrorType::ACTOR_DIED); error_info.set_error_message("Actor died by preemption."); } diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.cc b/src/ray/gcs/gcs_server/gcs_actor_manager.cc index 008b6c13e3a2..fb89e092f256 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.cc @@ -205,17 +205,35 @@ const ray::rpc::ActorDeathCause GcsActorManager::GenNodeDiedCause( const std::string ip_address, std::shared_ptr node) { ray::rpc::ActorDeathCause death_cause; + auto actor_died_error_ctx = death_cause.mutable_actor_died_error_context(); AddActorInfo(actor, actor_died_error_ctx); - actor_died_error_ctx->set_error_message( - absl::StrCat("The actor is dead because its node has died. 
Node Id: ", - NodeID::FromBinary(node->node_id()).Hex())); - - // TODO(vitsai): Publish this information as well - if (auto death_info = node->death_info(); - death_info.reason() == rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED) { - actor_died_error_ctx->set_preempted(true); + auto node_death_info = actor_died_error_ctx->mutable_node_death_info(); + node_death_info->CopyFrom(node->death_info()); + + std::ostringstream oss; + oss << "The actor died because its node has died. Node Id: " + << NodeID::FromBinary(node->node_id()).Hex() << "\n"; + switch (node_death_info->reason()) { + case rpc::NodeDeathInfo::EXPECTED_TERMINATION: + oss << "\tthe actor's node was terminated expectedly: "; + break; + case rpc::NodeDeathInfo::UNEXPECTED_TERMINATION: + oss << "\tthe actor's node was terminated unexpectedly: "; + break; + case rpc::NodeDeathInfo::AUTOSCALER_DRAIN_PREEMPTED: + oss << "\tthe actor's node was preempted: "; + break; + default: + // Control should not reach here, but in case it happens in unexpected scenarios, + // log it and provide a generic message to the user. + RAY_LOG(ERROR) << "Actor death is not expected to be caused by " + << rpc::NodeDeathInfo_Reason_Name(node_death_info->reason()); + oss << "\tthe actor's node was terminated: "; } + oss << node_death_info->reason_message(); + actor_died_error_ctx->set_error_message(oss.str()); + return death_cause; } diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index 05c148b3fb2b..8f6d9c27cdee 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -308,6 +308,20 @@ message RayException { string formatted_exception_string = 3; } +message NodeDeathInfo { + // TODO(sang): Update drain reason + enum Reason { + UNSPECIFIED = 0; + EXPECTED_TERMINATION = 1; + UNEXPECTED_TERMINATION = 2; + AUTOSCALER_DRAIN_PREEMPTED = 3; + AUTOSCALER_DRAIN_IDLE = 4; + } + Reason reason = 1; + // A message describing the reason for the node death. 
+ string reason_message = 2; +} + message ActorDeathCause { oneof context { // Indicates that this actor is marked as DEAD due to actor creation task failure. @@ -350,8 +364,8 @@ message ActorDiedErrorContext { // Whether the actor had never started running before it died, i.e. it was cancelled // before scheduling had completed. bool never_started = 10; - // Whether the actor was on a preempted node. - bool preempted = 11; + // The node death info, if node death is the cause of actor death. + optional NodeDeathInfo node_death_info = 11; } // Context for task OOM. diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index 652310c3b0bf..fd8310b9f566 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -318,20 +318,6 @@ message NodeSnapshot { repeated string node_activity = 3; } -message NodeDeathInfo { - // TODO(sang): Update drain reason - enum Reason { - UNSPECIFIED = 0; - EXPECTED_TERMINATION = 1; - UNEXPECTED_TERMINATION = 2; - AUTOSCALER_DRAIN_PREEMPTED = 3; - AUTOSCALER_DRAIN_IDLE = 4; - } - Reason reason = 1; - // A message describing the reason for the node death. - string reason_message = 2; -} - message GcsNodeInfo { // State of a node. enum GcsNodeState {