[Train] Rename Ray SGD v2 to Ray Train (ray-project#19436)
matthewdeng authored Oct 19, 2021
1 parent 46b4c74 commit 4674c78
Showing 98 changed files with 925 additions and 897 deletions.
6 changes: 3 additions & 3 deletions .buildkite/pipeline.gpu.large.yml
@@ -1,8 +1,8 @@
- label: ":tv: :octopus: SGD GPU tests "
conditions: ["RAY_CI_SGD_AFFECTED"]
- label: ":tv: :steam_locomotive: Train GPU tests "
conditions: ["RAY_CI_TRAIN_AFFECTED"]
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
- SGD_TESTING=1 INSTALL_HOROVOD=1 ./ci/travis/install-dependencies.sh
- pip install -Ur ./python/requirements_ml_docker.txt
- ./ci/travis/env_info.sh
- bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=gpu,gpu_only python/ray/util/sgd/...
- bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=gpu,gpu_only python/ray/train/...
8 changes: 7 additions & 1 deletion .buildkite/pipeline.yml
@@ -483,6 +483,13 @@
     - ./ci/travis/install-dependencies.sh
     - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=soft_imports python/ray/tune/...
 
+- label: ":steam_locomotive: Train tests and examples"
+  conditions: ["RAY_CI_TRAIN_AFFECTED"]
+  commands:
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
+    - SGD_TESTING=1 INSTALL_HOROVOD=1 ./ci/travis/install-dependencies.sh
+    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-gpu_only python/ray/train/...
+
 - label: ":octopus: SGD tests and examples"
   conditions: ["RAY_CI_SGD_AFFECTED"]
   commands:
@@ -491,7 +498,6 @@
     - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37,-flaky,-client,-gpu_only python/ray/util/sgd/...
     - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37,-flaky,-client,-gpu_only python/ray/util/sgd/...
     - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=client_unit_tests,-gpu_only --test_env=RAY_CLIENT_MODE=1 python/ray/util/sgd/...
-    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-gpu_only python/ray/util/sgd/v2/...
 
 - label: ":octopus: Tune/SGD/Modin/Dask tests and examples. Python 3.7"
   conditions: ["RAY_CI_TUNE_AFFECTED", "RAY_CI_SGD_AFFECTED"]
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug-report.yml
@@ -33,11 +33,11 @@ body:
- "Ray Core"
- "Ray Tune"
- "Ray Serve"
- "Ray Train"
- "RLlib"
- "Ray Clusters"
- "Monitoring & Debugging"
- "Dashboard"
- "RaySGD"
- "Others"
validations:
required: true
3 changes: 2 additions & 1 deletion README.rst
@@ -21,7 +21,7 @@ Ray is packaged with the following libraries for accelerating machine learning workloads:
 
 - `Tune`_: Scalable Hyperparameter Tuning
 - `RLlib`_: Scalable Reinforcement Learning
-- `RaySGD <https://docs.ray.io/en/master/raysgd/raysgd.html>`__: Distributed Training Wrappers
+- `Train`_: Distributed Deep Learning (alpha)
 - `Datasets`_: Flexible Distributed Data Loading (beta)
 
 As well as libraries for taking ML and distributed apps to production:
@@ -43,6 +43,7 @@ Install Ray with: ``pip install ray``. For nightly wheels, see the
 .. _`Serve`: https://docs.ray.io/en/master/serve/index.html
 .. _`Datasets`: https://docs.ray.io/en/master/data/dataset.html
 .. _`Workflows`: https://docs.ray.io/en/master/workflows/concepts.html
+.. _`Train`: https://docs.ray.io/en/master/train/train.html
 
 
 Quick Start
13 changes: 12 additions & 1 deletion ci/travis/determine_tests_to_run.py
@@ -72,6 +72,7 @@ def get_commit_range():
 
     RAY_CI_TUNE_AFFECTED = 0
     RAY_CI_SGD_AFFECTED = 0
+    RAY_CI_TRAIN_AFFECTED = 0
     RAY_CI_ONLY_RLLIB_AFFECTED = 0 # Whether only RLlib is affected.
     RAY_CI_RLLIB_AFFECTED = 0 # Whether RLlib minimal tests should be run.
     RAY_CI_RLLIB_FULL_AFFECTED = 0 # Whether full RLlib tests should be run.
@@ -110,6 +111,10 @@ def get_commit_range():
         RAY_CI_SGD_AFFECTED = 1
         RAY_CI_LINUX_WHEELS_AFFECTED = 1
         RAY_CI_MACOS_WHEELS_AFFECTED = 1
+    elif changed_file.startswith("python/ray/train"):
+        RAY_CI_TRAIN_AFFECTED = 1
+        RAY_CI_LINUX_WHEELS_AFFECTED = 1
+        RAY_CI_MACOS_WHEELS_AFFECTED = 1
     elif re.match("^(python/ray/)?rllib/", changed_file):
         RAY_CI_RLLIB_AFFECTED = 1
         RAY_CI_RLLIB_FULL_AFFECTED = 1
@@ -133,6 +138,7 @@ def get_commit_range():
     elif changed_file.startswith("python/"):
         RAY_CI_TUNE_AFFECTED = 1
         RAY_CI_SGD_AFFECTED = 1
+        RAY_CI_TRAIN_AFFECTED = 1
         RAY_CI_RLLIB_AFFECTED = 1
         RAY_CI_SERVE_AFFECTED = 1
         RAY_CI_PYTHON_AFFECTED = 1
@@ -167,6 +173,7 @@ def get_commit_range():
     elif changed_file.startswith("src/"):
         RAY_CI_TUNE_AFFECTED = 1
         RAY_CI_SGD_AFFECTED = 1
+        RAY_CI_TRAIN_AFFECTED = 1
         RAY_CI_RLLIB_AFFECTED = 1
         RAY_CI_SERVE_AFFECTED = 1
         RAY_CI_JAVA_AFFECTED = 1
@@ -189,6 +196,7 @@ def get_commit_range():
     else:
         RAY_CI_TUNE_AFFECTED = 1
         RAY_CI_SGD_AFFECTED = 1
+        RAY_CI_TRAIN_AFFECTED = 1
         RAY_CI_RLLIB_AFFECTED = 1
         RAY_CI_SERVE_AFFECTED = 1
         RAY_CI_JAVA_AFFECTED = 1
@@ -203,6 +211,7 @@ def get_commit_range():
 else:
     RAY_CI_TUNE_AFFECTED = 1
     RAY_CI_SGD_AFFECTED = 1
+    RAY_CI_TRAIN_AFFECTED = 1
     RAY_CI_RLLIB_AFFECTED = 1
     RAY_CI_RLLIB_FULL_AFFECTED = 1
     RAY_CI_SERVE_AFFECTED = 1
@@ -221,13 +230,15 @@ def get_commit_range():
         RAY_CI_STREAMING_CPP_AFFECTED and \
         not RAY_CI_STREAMING_PYTHON_AFFECTED and \
         not RAY_CI_STREAMING_JAVA_AFFECTED and \
-        not RAY_CI_SGD_AFFECTED:
+        not RAY_CI_SGD_AFFECTED and\
+        not RAY_CI_TRAIN_AFFECTED:
     RAY_CI_ONLY_RLLIB_AFFECTED = 1
 
 # Log the modified environment variables visible in console.
 output_string = " ".join([
     "RAY_CI_TUNE_AFFECTED={}".format(RAY_CI_TUNE_AFFECTED),
     "RAY_CI_SGD_AFFECTED={}".format(RAY_CI_SGD_AFFECTED),
+    "RAY_CI_TRAIN_AFFECTED={}".format(RAY_CI_TRAIN_AFFECTED),
     "RAY_CI_ONLY_RLLIB_AFFECTED={}".format(RAY_CI_ONLY_RLLIB_AFFECTED),
     "RAY_CI_RLLIB_AFFECTED={}".format(RAY_CI_RLLIB_AFFECTED),
     "RAY_CI_RLLIB_FULL_AFFECTED={}".format(RAY_CI_RLLIB_FULL_AFFECTED),
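For context, the pattern this script follows is a simple prefix match from changed file paths to CI flags, printed as KEY=VALUE pairs that Buildkite exposes to each step's `conditions` list. A minimal sketch of the idea (illustrative only; `affected_flags` is a hypothetical helper, not the real script):

    # Sketch of the prefix-routing pattern in ci/travis/determine_tests_to_run.py.
    def affected_flags(changed_files):
        # Start with every suite unaffected, then flip flags per matching prefix.
        flags = {"RAY_CI_SGD_AFFECTED": 0, "RAY_CI_TRAIN_AFFECTED": 0}
        for path in changed_files:
            if path.startswith("python/ray/util/sgd"):
                flags["RAY_CI_SGD_AFFECTED"] = 1
            elif path.startswith("python/ray/train"):
                flags["RAY_CI_TRAIN_AFFECTED"] = 1
        return flags

    # The real script prints the flags space-separated for Buildkite to consume.
    result = affected_flags(["python/ray/train/trainer.py"])
    print(" ".join("{}={}".format(k, v) for k, v in result.items()))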
20 changes: 10 additions & 10 deletions doc/source/data/dataset-pipeline.rst
@@ -230,8 +230,8 @@ Example: Per-Epoch Shuffle Pipeline
 
 .. tip::
 
   If you are interested in distributed ingest for deep learning, it is
-  recommended to use Ray Datasets in conjunction with :ref:`Ray SGD <sgd-v2-docs>`.
-  See the :ref:`example below<dataset-pipeline-ray-sgd>` for more info.
+  recommended to use Ray Datasets in conjunction with :ref:`Ray Train <train-docs>`.
+  See the :ref:`example below<dataset-pipeline-ray-train>` for more info.
 
 ..
   https://docs.google.com/drawings/d/1vWQ-Zfxy2_Gthq8l3KmNsJ7nOCuYUQS9QMZpj5GHYx0/edit
@@ -300,13 +300,13 @@ Similar to how you can ``.split()`` a Dataset, you can also split a DatasetPipeline
 
 .. image:: dataset-repeat-2.svg
 
-.. _dataset-pipeline-ray-sgd:
+.. _dataset-pipeline-ray-train:
 
-Distributed Ingest with Ray SGD
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Ray Datasets integrates with :ref:`Ray SGD <sgd-v2-docs>`, further simplifying your distributed ingest pipeline.
+Distributed Ingest with Ray Train
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Ray Datasets integrates with :ref:`Ray Train <train-docs>`, further simplifying your distributed ingest pipeline.
 
-Ray SGD is a lightweight library for scalable deep learning on Ray.
+Ray Train is a lightweight library for scalable deep learning on Ray.
 
 1. It allows you to focus on the training logic and automatically handles distributed setup for your framework of choice (PyTorch, Tensorflow, or Horovod).
 2. It has out-of-the-box fault tolerance and elastic training.
@@ -319,7 +319,7 @@ Ray SGD is a lightweight library for scalable deep learning on Ray.
     def train_func():
         # This is a dummy train function just iterating over the dataset shard.
         # You should replace this with your training logic.
-        shard = ray.sgd.get_dataset_shard()
+        shard = ray.train.get_dataset_shard()
         for row in shard.iter_rows():
             print(row)
@@ -338,8 +338,8 @@
         config={"worker_batch_size": 64, "num_epochs": 2},
         dataset=pipe)
 
-Ray SGD is responsible for the orchestration of the training workers and will automatically split the Dataset for you.
-See :ref:`the SGD User Guide <sgd-dataset-pipeline>` for more details.
+Ray Train is responsible for the orchestration of the training workers and will automatically split the Dataset for you.
+See :ref:`the Train User Guide <train-dataset-pipeline>` for more details.
 
 Changing Pipeline Structure
 ---------------------------
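For context, a self-contained version of the example these hunks touch might look roughly as follows under the renamed API. This is a sketch assuming the Ray 1.7-era ``ray.train`` interface shown in the diff; the Torch backend and the range dataset are illustrative choices, not part of the original example.

    import ray
    from ray import train
    from ray.train import Trainer

    def train_func():
        # Dummy training loop: each worker iterates over its shard of the pipeline.
        shard = train.get_dataset_shard()
        for row in shard.iter_rows():
            print(row)

    ray.init()
    # Dataset.repeat() turns a Dataset into a DatasetPipeline (one window per epoch).
    pipe = ray.data.range(1000).repeat(2)

    trainer = Trainer(backend="torch", num_workers=2)
    trainer.start()
    trainer.run(
        train_func,
        config={"worker_batch_size": 64, "num_epochs": 2},
        dataset=pipe)
    trainer.shutdown()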
2 changes: 1 addition & 1 deletion doc/source/data/dataset.rst
@@ -16,7 +16,7 @@ Ray Datasets are the standard way to load and exchange data in Ray libraries and applications.
 
 Concepts
 --------
-Ray Datasets implement `Distributed Arrow <https://arrow.apache.org/>`__. A Dataset consists of a list of Ray object references to *blocks*. Each block holds a set of items in either an `Arrow table <https://arrow.apache.org/docs/python/data.html#tables>`__ or a Python list (for Arrow incompatible objects). Having multiple blocks in a dataset allows for parallel transformation and ingest of the data (e.g., into :ref:`Ray SGD <sgd-v2-docs>` for ML training).
+Ray Datasets implement `Distributed Arrow <https://arrow.apache.org/>`__. A Dataset consists of a list of Ray object references to *blocks*. Each block holds a set of items in either an `Arrow table <https://arrow.apache.org/docs/python/data.html#tables>`__ or a Python list (for Arrow incompatible objects). Having multiple blocks in a dataset allows for parallel transformation and ingest of the data (e.g., into :ref:`Ray Train <train-docs>` for ML training).
 
 The following figure visualizes a Dataset that has three Arrow table blocks, each holding 1000 rows:

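To make the block concept concrete, here is a small sketch, assuming the Datasets API of this era (``num_blocks()`` is my assumption for the introspection call, not something shown in the diff):

    import ray

    ray.init()
    ds = ray.data.range(1000)  # items are spread across multiple blocks
    print(ds.num_blocks())     # each block is a Ray object reference
    print(ds.take(5))          # -> [0, 1, 2, 3, 4]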
2 changes: 1 addition & 1 deletion doc/source/data/raydp.rst
@@ -140,7 +140,7 @@ PyTorch.
     return torch.optim.lr_scheduler.MultiStepLR(
         optimizer, milestones=[150, 250, 350], gamma=0.1)
 
-# You can use the RayDP Estimator API or libraries like RaySGD for distributed training.
+# You can use the RayDP Estimator API or libraries like Ray Train for distributed training.
 from raydp.torch import TorchEstimator
 estimator = TorchEstimator(
     num_workers = 2,
16 changes: 8 additions & 8 deletions doc/source/index.rst
@@ -332,14 +332,14 @@ Papers
 .. toctree::
    :hidden:
    :maxdepth: -1
-   :caption: Ray SGD
-
-   raysgd/v2/raysgd.rst
-   raysgd/v2/user_guide.rst
-   raysgd/v2/examples.rst
-   raysgd/v2/architecture.rst
-   raysgd/v2/api.rst
-   raysgd/v2/migration-guide.rst
+   :caption: Ray Train
+
+   train/train.rst
+   train/user_guide.rst
+   train/examples.rst
+   train/architecture.rst
+   train/api.rst
+   train/migration-guide.rst
+   RaySGD v1: Distributed Training Wrappers <raysgd/raysgd.rst>
 
 .. toctree::
6 changes: 3 additions & 3 deletions doc/source/lightgbm-ray.rst
@@ -1,9 +1,7 @@
-.. _lightgbm-ray:
-
 ..
   This part of the docs is generated from the LightGBM-Ray readme using m2r
   To update:
-  - run `m2r /path/to/_lightgbm_ray/README.md`
+  - run `m2r /path/to/lightgbm_ray/README.md`
   - copy the contents of README.rst here
   - Get rid of the badges in the top
   - Get rid of the references section at the bottom
@@ -12,6 +10,8 @@
   by adding a second underscore (use `target <link>`__)
   - Search for `\ **` and delete this from the links (bold links are not supported)
 
+.. _lightgbm-ray:
+
 Distributed LightGBM on Ray
 ===========================

6 changes: 3 additions & 3 deletions doc/source/ray-lightning.rst
@@ -1,9 +1,7 @@
-.. _ray-lightning:
-
 ..
   This part of the docs is generated from the Ray Lightning readme using m2r
   To update:
-  - run `m2r /path/to/xgboost_ray/README.md`
+  - run `m2r /path/to/ray_lightning/README.md`
   - copy the contents of README.rst here
   - remove the table of contents
   - remove the PyTorch Lightning Compatibility section
@@ -12,6 +10,8 @@
   by adding a second underscore (use `target <link>`__)
   - Search for `\ **` and delete this from the links (bold links are not supported)
 
+.. _ray-lightning:
+
 Distributed PyTorch Lightning Training on Ray
 =============================================

4 changes: 2 additions & 2 deletions doc/source/raysgd/raysgd.rst
@@ -5,8 +5,8 @@ RaySGD: Distributed Training Wrappers
 =====================================
 
 
-.. warning:: This is an older version of Ray SGD. A newer, more light-weight version of Ray SGD is in alpha as of Ray 1.7.
-   See the documentation :ref:`here <sgd-v2-docs>`. To migrate from v1 to v2 you can follow the :ref:`migration guide <sgd-migration>`.
+.. warning:: This is an older version of Ray SGD. A newer, more light-weight version of Ray SGD (named Ray Train) is in alpha as of Ray 1.7.
+   See the documentation :ref:`here <train-docs>`. To migrate from v1 to v2 you can follow the :ref:`migration guide <sgd-migration>`.
 
 RaySGD is a lightweight library for distributed deep learning, providing thin wrappers around PyTorch and TensorFlow native modules for data parallel training.

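Since several of these pages point at the same migration guide, a heavily hedged sketch of the v1-to-v2 shape change may help: in v1 the trainer owns the training loop (via an operator class), while in v2 (Ray Train) you pass your own training function. Names below are placeholders, not the guide's exact code.

    from ray.train import Trainer  # v2 API (Ray Train), alpha as of Ray 1.7

    def my_train_func():
        # v2: you own the per-worker training loop; Ray Train distributes it.
        print("one training step per worker")

    # v1 (ray.util.sgd) for comparison -- the trainer owns the loop via an
    # operator class (MyOperator is a placeholder, elided here):
    #   from ray.util.sgd import TorchTrainer
    #   trainer = TorchTrainer(training_operator_cls=MyOperator, num_workers=2)
    #   trainer.train()

    trainer = Trainer(backend="torch", num_workers=2)
    trainer.start()
    trainer.run(my_train_func)
    trainer.shutdown()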
4 changes: 2 additions & 2 deletions doc/source/raysgd/raysgd_pytorch.rst
@@ -3,8 +3,8 @@
 
 Distributed PyTorch
 ===================
 
-.. warning:: This is an older version of Ray SGD. A newer, more light-weight version of Ray SGD is in alpha as of Ray 1.7.
-   See the documentation :ref:`here <sgd-v2-docs>`. To migrate from v1 to v2 you can follow the :ref:`migration guide <sgd-migration>`.
+.. warning:: This is an older version of Ray SGD. A newer, more light-weight version of Ray SGD (named Ray Train) is in alpha as of Ray 1.7.
+   See the documentation :ref:`here <train-docs>`. To migrate from v1 to v2 you can follow the :ref:`migration guide <sgd-migration>`.
 
 The RaySGD ``TorchTrainer`` simplifies distributed model training for PyTorch.

4 changes: 2 additions & 2 deletions doc/source/raysgd/raysgd_tensorflow.rst
@@ -1,8 +1,8 @@
 Distributed TensorFlow
 ======================
 
-.. warning:: This is an older version of Ray SGD. A newer, more light-weight version of Ray SGD is in alpha as of Ray 1.7.
-   See the documentation :ref:`here <sgd-v2-docs>`. To migrate from v1 to v2 you can follow the :ref:`migration guide <sgd-migration>`.
+.. warning:: This is an older version of Ray SGD. A newer, more light-weight version of Ray SGD (named Ray Train) is in alpha as of Ray 1.7.
+   See the documentation :ref:`here <train-docs>`. To migrate from v1 to v2 you can follow the :ref:`migration guide <sgd-migration>`.
 
 RaySGD's ``TFTrainer`` simplifies distributed model training for TensorFlow. The ``TFTrainer`` is a wrapper around ``MultiWorkerMirroredStrategy`` with a Python API to easily incorporate distributed training into a larger Python application, as opposed to writing custom logic for setting up environments and starting separate processes.

4 changes: 2 additions & 2 deletions doc/source/raysgd/raysgd_tune.rst
@@ -3,8 +3,8 @@
 
 RaySGD Hyperparameter Tuning
 ============================
 
-.. warning:: This is an older version of Ray SGD. A newer, more light-weight version of Ray SGD is in alpha as of Ray 1.7.
-   See the documentation :ref:`here <sgd-v2-docs>`. To migrate from v1 to v2 you can follow the :ref:`migration guide <sgd-migration>`.
+.. warning:: This is an older version of Ray SGD. A newer, more light-weight version of Ray SGD (named Ray Train) is in alpha as of Ray 1.7.
+   See the documentation :ref:`here <train-docs>`. To migrate from v1 to v2 you can follow the :ref:`migration guide <sgd-migration>`.
 
 RaySGD integrates with :ref:`Ray Tune <tune-60-seconds>` to easily run distributed hyperparameter tuning experiments with your RaySGD Trainer.
