FEAT: Slurm Deployment For Xorbits #3295
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow identity and triggers for the Python CI pipeline.
name: Python CI

on:
  # Pushes to any branch whose name matches the glob below.
  push:
    branches:
      - "*"
  # Pull-request activity that changes what has to be tested.
  pull_request:
    types:
      - opened
      - reopened
      - synchronize

# Runs are grouped per workflow and ref; starting a new run in a group
# cancels the one that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
# Two jobs: `lint` runs the static checks and gates `build_test_job`,
# which fans out over an OS / Python version / module matrix.
jobs:
  # Static checks for the Python package and the web UI.
  lint:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: ["ubuntu-latest"]
        python-version: ["3.10"]
    steps:
      - name: Check out code
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Set up Python environment
        uses: actions/setup-python@v4
        with:
          # Read from the matrix so the version is declared in one place.
          python-version: ${{ matrix.python-version }}
      - name: flake8 Lint
        uses: py-actions/flake8@v2
        with:
          path: "python/xorbits"
          args: "--config python/setup.cfg"
      - name: black
        uses: psf/black@stable
        with:
          src: "python/xorbits"
          options: "--check"
      - name: isort
        uses: isort/isort-action@master
        with:
          sortPaths: "python/xorbits"
          configuration: "--check-only --diff --sp python/setup.cfg"
      - name: mypy
        run: pip install mypy && cd python && mypy xorbits
      - name: codespell
        run: pip install codespell && cd python && codespell xorbits
      - name: Set up Node.js
        uses: actions/setup-node@v1
        with:
          node-version: 16
      # ESLint and Prettier must be in `package.json`
      - name: Install Node.js dependencies
        run: cd python/xorbits/web/ui && npm ci
      - name: ESLint Check
        run: cd python/xorbits/web/ui && npx eslint .
      - name: Prettier Check
        run: cd python/xorbits/web/ui && ./node_modules/.bin/prettier --check .

  # Builds the package and runs the test suite selected by `matrix.module`.
  build_test_job:
    # Only runs in the upstream repository, and only after `lint`.
    if: github.repository == 'xorbitsai/xorbits'
    runs-on: ${{ matrix.os }}
    needs: lint
    env:
      CONDA_ENV: xorbits-test
    defaults:
      run:
        # Login shell for every `run` step of this job. Because this is a
        # custom shell template, GitHub does not add `-e` on its own.
        shell: bash -l {0}
    strategy:
      fail-fast: false
      matrix:
        os: ["ubuntu-latest", "macos-latest", "windows-latest"]
        python-version: ["3.8", "3.9", "3.10", "3.11"]
        module: ["xorbits", "kubernetes"]
        # Version numbers are quoted everywhere below: an unquoted 3.10 is
        # read by YAML as the float 3.1 and would no longer equal the
        # "3.10" string listed in the matrix above.
        exclude:
          - { os: macos-latest, python-version: "3.10" }
          - { os: macos-latest, python-version: "3.9" }
          - { os: windows-latest, python-version: "3.10" }
          - { os: windows-latest, python-version: "3.9" }
          - { os: windows-latest, module: kubernetes }
          - { os: macos-latest, module: kubernetes }
        # Extra single combinations, one per additional test module.
        include:
          - { os: ubuntu-latest, module: _mars/dataframe, python-version: "3.9" }
          - { os: ubuntu-latest, module: _mars/tensor, python-version: "3.9" }
          - { os: ubuntu-latest, module: _mars/learn, python-version: "3.9" }
          - { os: ubuntu-latest, module: learn, python-version: "3.9" }
          - { os: ubuntu-latest, module: mars-core, python-version: "3.9" }
          - { os: ubuntu-20.04, module: hadoop, python-version: "3.9" }
          - { os: ubuntu-latest, module: vineyard, python-version: "3.9" }
          - { os: ubuntu-latest, module: external-storage, python-version: "3.9" }
          - { os: ubuntu-latest, module: compatibility, python-version: "3.9" }
          - { os: ubuntu-latest, module: doc-build, python-version: "3.9" }
          - { os: self-hosted, module: gpu, python-version: "3.9" }
          - { os: ubuntu-latest, module: jax, python-version: "3.9" }
          - { os: juicefs-ci, module: kubernetes-juicefs, python-version: "3.9" }
          - { os: slurm-ci, module: slurm, python-version: "3.9" }
          - { os: ubuntu-latest, module: datasets, python-version: "3.9" }
    steps:
      - name: Check out code
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Add msbuild to PATH
        if: ${{ matrix.os == 'windows-latest'}}
        # NOTE(review): the scraped source showed this reference as
        # "microsoft/[email protected]" (e-mail obfuscation residue). The
        # version tag below is a reconstruction -- confirm against the repo.
        uses: microsoft/setup-msbuild@v1.0.2
      - name: Set up conda ${{ matrix.python-version }}
        uses: conda-incubator/setup-miniconda@v2
        # Skipped for the gpu and kubernetes-juicefs modules.
        if: ${{ matrix.module != 'gpu' && matrix.module != 'kubernetes-juicefs' }}
        with:
          python-version: ${{ matrix.python-version }}
          activate-environment: ${{ env.CONDA_ENV }}
      - name: Start minikube
        uses: medyagh/setup-minikube@master
        if: ${{ matrix.module == 'kubernetes' }}
        with:
          driver: none
          kubernetes-version: v1.23.12
      - name: Install ucx dependencies
        if: ${{ (matrix.module != 'gpu') && (matrix.os == 'ubuntu-latest') && (matrix.python-version != '3.11') }}
        run: |
          conda install -c conda-forge -c rapidsai ucx-proc=*=cpu ucx ucx-py
      - name: Install dependencies
        env:
          MODULE: ${{ matrix.module }}
          PYTHON: ${{ matrix.python-version }}
        if: ${{ matrix.module != 'gpu' }}
        run: |
          # Packages needed by every module.
          pip install -e "git+https://github.com/xorbitsai/xoscar.git@main#subdirectory=python&egg=xoscar"
          pip install numpy scipy cython pyftpdlib coverage flaky "numexpr<2.8.5"
          if [[ "$MODULE" == "xorbits" ]]; then
            pip install openpyxl
          fi
          if [[ "$MODULE" == "mars-core" ]]; then
            pip install oss2
          fi
          if [[ "$MODULE" == "kubernetes" ]]; then
            pip install kubernetes
          fi
          if [[ "$MODULE" == "kubernetes-juicefs" ]]; then
            pip install kubernetes
            # Each tool below is installed only when the check in front of it
            # does not find it already present.
            if helm version --short >/dev/null 2>&1; then
              echo "Helm installed. Skipping installation."
            else
              curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
            fi
            if helm repo list | grep "juicefs" >/dev/null 2>&1; then
              echo "Repo juicefs existed. Skipping adding to helm repos."
            else
              helm repo add juicefs https://juicedata.github.io/charts/
              helm repo update
              helm fetch --untar juicefs/juicefs-csi-driver
            fi
            if kubectl get pods -n kube-system | grep "juicefs-csi" >/dev/null 2>&1; then
              echo "Juicefs-csi-driver installed. Skipping installation."
            else
              helm install juicefs-csi-driver juicefs/juicefs-csi-driver -n kube-system -f xorbits/deploy/kubernetes/external_storage/juicefs/tests/values.yaml
            fi
            sleep 60
            kubectl apply -f xorbits/deploy/kubernetes/external_storage/juicefs/tests/example-redis-config.yaml
            sleep 60
          fi
          if [[ "$MODULE" == "hadoop" ]]; then
            ../CI/install-hadoop.sh
            # Write a .pth file into site-packages that calls
            # coverage.process_startup() in every Python process.
            echo "import coverage; coverage.process_startup()" > \
              $(python -c "import site; print(site.getsitepackages()[-1])")/coverage.pth
            conda install --quiet --yes -c conda-forge skein libffi conda-pack grpcio=1.42.0
          fi
          if [[ "$MODULE" == "vineyard" ]]; then
            pip install "vineyard<0.16.1" -i https://pypi.org/simple
            # Download etcd, move its two binaries to /usr/local/bin and
            # remove the download.
            mkdir -p /tmp/etcd-download-test
            export ETCD_VER=v3.4.13
            export ETCD_DOWNLOAD_URL=https://github.com/etcd-io/etcd/releases/download
            curl -L $ETCD_DOWNLOAD_URL/$ETCD_VER/etcd-$ETCD_VER-linux-amd64.tar.gz -o /tmp/etcd-$ETCD_VER-linux-amd64.tar.gz
            tar xzvf /tmp/etcd-$ETCD_VER-linux-amd64.tar.gz -C /tmp/etcd-download-test --strip-components=1
            sudo mv /tmp/etcd-download-test/etcd /usr/local/bin/
            sudo mv /tmp/etcd-download-test/etcdctl /usr/local/bin/
            rm -fr /tmp/etcd-$ETCD_VER-linux-amd64.tar.gz /tmp/etcd-download-test
          fi
          if [[ "$MODULE" == "external-storage" ]]; then
            # Alluxio: download, configure for localhost, move under
            # /usr/local/bin, then format and start it.
            export CUR_DIR=$(pwd)
            mkdir -p /tmp/alluxio-download-test
            cd /tmp/alluxio-download-test
            wget -q https://downloads.alluxio.io/downloads/files/2.9.3/alluxio-2.9.3-bin.tar.gz
            tar -xzf alluxio-2.9.3-bin.tar.gz
            cd alluxio-2.9.3
            cp conf/alluxio-env.sh.template conf/alluxio-env.sh
            cp conf/alluxio-site.properties.template conf/alluxio-site.properties
            echo "alluxio.master.hostname=localhost" >> conf/alluxio-site.properties
            cd $CUR_DIR
            sudo mv /tmp/alluxio-download-test/alluxio-2.9.3 /usr/local/bin/
            rm -R /tmp/alluxio-download-test
            unset CUR_DIR
            export ALLUXIO_HOME="/usr/local/bin/alluxio-2.9.3"
            ${ALLUXIO_HOME}/bin/alluxio format
            ${ALLUXIO_HOME}/bin/alluxio-start.sh local SudoMount
            # JuiceFS and the Redis package it is tested with.
            curl -sSL https://d.juicefs.com/install | sh -
            curl -fsSL https://packages.redis.io/gpg | sudo gpg --dearmor -o /usr/share/keyrings/redis-archive-keyring.gpg
            echo "deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/redis.list
            sudo apt-get update
            sudo apt-get install redis
            export JUICEFS_HOME="/usr/local/bin/juicefs"
          fi
          if [[ "$PYTHON" == '3.9' ]]; then
            pip install h5py zarr matplotlib fastparquet
            # NOTE(review): this targets a conda env named "test", while the
            # job activates "xorbits-test" (CONDA_ENV); `|| true` hides any
            # failure. Left unchanged -- confirm whether this is intended.
            conda install -n test --quiet --yes -c conda-forge python=$PYTHON \
              "tiledb-py>=0.4.3,<0.6.0" "tiledb<2.0.0" || true
          fi
          if [[ "$MODULE" == "_mars/learn" ]]; then
            pip install scikit-learn
            pip install xgboost lightgbm keras tensorflow faiss-cpu\<1.7.3 torch torchvision \
              statsmodels tsfresh dask[complete] mimesis\<9.0.0
          fi
          if [[ "$MODULE" == "learn" ]]; then
            pip install xgboost lightgbm
          fi
          if [[ "$MODULE" == "ray-dag" ]] || [[ "$MODULE" == "ray-deploy" ]]; then
            pip install "xgboost_ray<0.1.14" "protobuf<4" "sqlalchemy<2"
          fi
          if [[ "$MODULE" == "compatibility" ]]; then
            # test if compatible with older versions
            pip install pandas==1.5.3 pyarrow\<12.0.0 sqlalchemy\<2
          fi
          if [[ "$MODULE" == "jax" ]]; then
            # test jax
            pip install "jax>=0.4.0" "jaxlib>=0.4.0"
          fi
          if [[ "$MODULE" == "datasets" ]]; then
            # test huggingface datasets
            pip install datasets torch
          fi
          # Finally install the package itself in editable mode.
          pip install -e ".[dev,extra,aws]"
        working-directory: ./python
      - name: Install on JuiceFsCI
        if: ${{ matrix.module == 'kubernetes-juicefs' }}
        run: |
          pip install -U xoscar
          python setup.py build_ext -i
        working-directory: ./python
      # The three Slurm steps source CI/<module>.sh from the repository root
      # and call one function from it each.
      - name: Slurm Setup Job queuing system
        if: ${{ matrix.module == 'slurm' }}
        run: |
          source CI/${{ matrix.module }}.sh
          jobqueue_before_install
      - name: Slurm Install xorbits
        if: ${{ matrix.module == 'slurm' }}
        run: |
          source CI/${{ matrix.module }}.sh
          jobqueue_install
      - name: Install on GPU
        if: ${{ matrix.module == 'gpu' }}
        run: |
          pip install -U xoscar
          python setup.py build_ext -i
        working-directory: ./python
      - name: Build doc
        if: ${{ matrix.module == 'doc-build' }}
        run: |
          make html
          make html_zh_cn
        working-directory: ./doc
      - name: Test with pytest
        if: ${{ matrix.module != 'doc-build' }}
        env:
          MODULE: ${{ matrix.module }}
        run: |
          # The job shell is a custom template (`bash -l {0}`), so nothing
          # enables fail-fast for us. Without this line only the LAST pytest
          # call of a branch would decide whether the step passes.
          set -e
          if [[ "$MODULE" == "xorbits" ]]; then
            pytest --ignore xorbits/_mars/ --ignore xorbits/xgboost --ignore xorbits/lightgbm \
              --ignore xorbits/datasets --timeout=1500 \
              -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits
          elif [[ "$MODULE" == "mars-core" ]]; then
            pytest --forked --log-level=DEBUG --ignore xorbits/_mars/dataframe --ignore xorbits/_mars/tensor \
              --ignore xorbits/_mars/learn --ignore xorbits/_mars/remote \
              --timeout=1500 \
              -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars
          elif [[ "$MODULE" == "kubernetes" ]]; then
            pytest --ignore xorbits/_mars/ --timeout=1500 \
              -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/kubernetes
          elif [[ "$MODULE" == "kubernetes-juicefs" ]]; then
            pytest --ignore xorbits/_mars/ --timeout=1500 \
              -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/kubernetes/external_storage/juicefs
          elif [[ "$MODULE" == "slurm" ]]; then
            # Install xorbits in containers c1 and c2, then run the tests
            # inside the slurmctld container.
            docker exec c1 /bin/bash -c "pip install xorbits"
            docker exec c2 /bin/bash -c "pip install xorbits"
            docker exec slurmctld /bin/bash -c \
              "pytest --ignore xorbits/_mars/ --timeout=1500 \
              -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/deploy/cluster"
          elif [[ "$MODULE" == "hadoop" ]]; then
            export WITH_HADOOP="1"
            export HADOOP_HOME="/usr/local/hadoop"
            export HADOOP_INSTALL=$HADOOP_HOME
            export HADOOP_MAPRED_HOME=$HADOOP_HOME
            export HADOOP_COMMON_HOME=$HADOOP_HOME
            export HADOOP_HDFS_HOME=$HADOOP_HOME
            export YARN_HOME=$HADOOP_HOME
            export HADOOP_COMMON_LIB_NATIVE_DIR="$HADOOP_HOME/lib/native"
            export PATH="$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin"
            pytest --timeout=1500 -W ignore::PendingDeprecationWarning xorbits/_mars -m hadoop
          elif [[ "$MODULE" == "vineyard" ]]; then
            pytest --timeout=1500 -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/storage/tests/test_libs.py
            pytest --timeout=1500 -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/deploy/oscar/tests/test_local.py
            pytest --timeout=1500 -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits -k "vineyard" \
              xorbits/_mars/tensor/datastore/tests/test_datastore_execution.py \
              xorbits/_mars/dataframe/datastore/tests/test_datastore_execution.py
          elif [[ "$MODULE" == "external-storage" ]]; then
            export ALLUXIO_HOME="/usr/local/bin/alluxio-2.9.3"
            export JUICEFS_HOME="/usr/local/bin/juicefs"
            pytest --timeout=1500 -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/storage/tests/test_libs.py
          elif [[ "$MODULE" == "_mars/learn" ]]; then
            pytest --timeout=1500 \
              -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits \
              xorbits/$MODULE xorbits/_mars/contrib/dask/tests/test_dask.py
          elif [[ "$MODULE" == "learn" ]]; then
            pytest --timeout=1500 \
              -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits \
              xorbits/xgboost xorbits/lightgbm
          elif [[ "$MODULE" == "ray-deploy" ]]; then
            pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
              --log-level=DEBUG --timeout=200 xorbits/_mars --ignore=xorbits/_mars/deploy/oscar/ -m ray
            pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
              --log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray.py -m ray
            pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
              --log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_load_modules.py -m ray
            pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
              --log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_cluster_standalone.py -m ray
            pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
              --log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_client.py -m ray
            pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
              --log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_fault_injection.py -m ray
            pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits --durations=0 \
              --log-level=DEBUG --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_scheduling.py -m ray
          elif [[ "$MODULE" == "ray-dag" ]]; then
            export MARS_CI_BACKEND=ray
            export RAY_idle_worker_killing_time_threshold_ms=60000
            pytest --cov-config=setup.cfg --cov-report=xml --durations=0 --timeout=500 xorbits/_mars/dataframe \
              -v -s -m "not skip_ray_dag" --ignore=xorbits/_mars/dataframe/contrib/raydataset
            pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
              --timeout=500 xorbits/_mars/dataframe/contrib/raydataset -v -s -m "not skip_ray_dag"
            pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
              --timeout=500 xorbits/_mars/tensor -v -s -m "not skip_ray_dag"
            pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
              --timeout=500 xorbits/_mars/learn --ignore xorbits/_mars/learn/contrib \
              --ignore xorbits/_mars/learn/utils/tests/test_collect_ports.py -m "not skip_ray_dag"
            pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
              --timeout=200 xorbits -v -s -m ray_dag
            pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
              --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_dag.py
            pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
              --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_dag_failover.py
            pytest --cov-config=setup.cfg --cov-report=xml --durations=0 \
              --timeout=200 xorbits/_mars/deploy/oscar/tests/test_ray_dag_oscar.py -m ray
          elif [[ "$MODULE" == "gpu" ]]; then
            pytest -m cuda --gpu --ignore xorbits/datasets --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits
          elif [[ "$MODULE" == "jax" ]]; then
            pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/tensor/fuse/tests/test_runtime_fusion.py
            pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/_mars/tensor/
          elif [[ "$MODULE" == "datasets" ]]; then
            pytest --cov-config=setup.cfg --cov-report=xml --cov=xorbits xorbits/datasets
          elif [[ "$MODULE" == "compatibility" ]]; then
            pytest --timeout=1500 \
              -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits/deploy --cov=xorbits xorbits/_mars/dataframe
            pytest --timeout=1500 \
              -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits/deploy --cov=xorbits xorbits/_mars/services/storage
          else
            # Any other module name is used directly as a path below xorbits/.
            pytest --timeout=1500 \
              -W ignore::PendingDeprecationWarning \
              --cov-config=setup.cfg --cov-report=xml --cov=xorbits/deploy --cov=xorbits xorbits/$MODULE
          fi
        working-directory: ./python
      - name: Cleanup on slurm
        # always(): run this step even when an earlier step failed, so the
        # Slurm after-script is not skipped on a red test run.
        if: ${{ always() && matrix.module == 'slurm' }}
        run: |
          source CI/${{ matrix.module }}.sh
          jobqueue_after_script
      - name: Report coverage data
        uses: codecov/codecov-action@v3
        with:
          working-directory: ./python
          flags: unittests
          token: ${{ secrets.CODECOV_TOKEN }}