Commit

Merge branch 'master' of https://github.com/modin-project/modin into issue6708
anmyachev committed Jan 17, 2024
2 parents 97add7c + 43134ef commit 8728894
Showing 170 changed files with 5,225 additions and 2,912 deletions.
2 changes: 1 addition & 1 deletion .github/actions/run-core-tests/group_3/action.yml
@@ -19,6 +19,6 @@ runs:
shell: bash -l {0}
- run: |
echo "::group::Running experimental groupby tests (group 3)..."
MODIN_EXPERIMENTAL_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py
MODIN_RANGE_PARTITIONING_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py
echo "::endgroup::"
shell: bash -l {0}
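
The renamed switch can also be exercised outside CI; a minimal local sketch, assuming Modin is installed from sources and pytest is available:

```bash
# Run the groupby test suite with range-partitioning groupby enabled
# (the variable that replaces MODIN_EXPERIMENTAL_GROUPBY above).
MODIN_RANGE_PARTITIONING_GROUPBY=1 python -m pytest modin/pandas/test/test_groupby.py
```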
24 changes: 15 additions & 9 deletions .github/workflows/ci-notebooks.yml
@@ -8,6 +8,7 @@ on:
- setup.cfg
- setup.py
- requirements/env_hdk.yml
- requirements/env_unidist_linux.yml
concurrency:
# Cancel other jobs in the same branch. We don't care whether CI passes
# on old commits.
@@ -28,12 +29,17 @@
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/python-only
if: matrix.execution != 'hdk_on_native'
if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
- uses: ./.github/actions/mamba-env
with:
environment-file: requirements/env_hdk.yml
activate-environment: modin_on_hdk
if: matrix.execution == 'hdk_on_native'
- uses: ./.github/actions/mamba-env
with:
environment-file: requirements/env_unidist_linux.yml
activate-environment: modin_on_unidist
if: matrix.execution == 'pandas_on_unidist'
- name: Cache datasets
uses: actions/cache@v2
with:
@@ -43,29 +49,29 @@
# replace modin with . in the tutorial requirements file for `pandas_on_ray` and
# `pandas_on_dask` since we need Modin built from sources
- run: sed -i 's/modin/./g' examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt
if: matrix.execution != 'hdk_on_native'
if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
# install dependencies required for notebooks execution for `pandas_on_ray` and `pandas_on_dask`
# Override modin-spreadsheet install for now
- run: |
pip install -r examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt
pip install git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
if: matrix.execution != 'hdk_on_native'
# Build Modin from sources for `hdk_on_native`
if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
# Build Modin from sources for `hdk_on_native` and `pandas_on_unidist`
- run: pip install -e .
if: matrix.execution == 'hdk_on_native'
if: matrix.execution == 'hdk_on_native' || matrix.execution == 'pandas_on_unidist'
# install test dependencies
# NOTE: If you are changing the set of packages installed here, make sure that
# the dev requirements match them.
- run: pip install pytest pytest-cov black flake8 flake8-print flake8-no-implicit-concat
if: matrix.execution != 'hdk_on_native'
if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
- run: pip install flake8-print jupyter nbformat nbconvert
if: matrix.execution == 'hdk_on_native'
if: matrix.execution == 'hdk_on_native' || matrix.execution == 'pandas_on_unidist'
- run: pip list
if: matrix.execution != 'hdk_on_native'
if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
- run: |
conda info
conda list
if: matrix.execution == 'hdk_on_native'
if: matrix.execution == 'hdk_on_native' || matrix.execution == 'pandas_on_unidist'
# setup kernel configuration for `pandas_on_unidist` execution with mpi backend
- run: python examples/tutorial/jupyter/execution/${{ matrix.execution }}/setup_kernel.py
if: matrix.execution == 'pandas_on_unidist'
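
The new `pandas_on_unidist` notebook job can be approximated locally; a hedged sketch, assuming conda is available and that `requirements/env_unidist_linux.yml` defines the `modin_on_unidist` environment activated above:

```bash
# Create the unidist environment, build Modin from sources, install the
# notebook tooling, and register the MPI kernel configuration.
conda env create -f requirements/env_unidist_linux.yml
conda activate modin_on_unidist
pip install -e .
pip install jupyter nbformat nbconvert
python examples/tutorial/jupyter/execution/pandas_on_unidist/setup_kernel.py
```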
3 changes: 0 additions & 3 deletions .github/workflows/ci-required.yml
@@ -66,8 +66,6 @@ jobs:
asv_bench/benchmarks/__init__.py asv_bench/benchmarks/io/__init__.py \
asv_bench/benchmarks/scalability/__init__.py \
modin/core/io \
modin/experimental/core/execution/ray/implementations/pandas_on_ray \
modin/experimental/core/execution/ray/implementations/pyarrow_on_ray \
modin/pandas/series.py \
modin/core/execution/python \
modin/pandas/dataframe.py \
@@ -91,7 +89,6 @@
python scripts/doc_checker.py modin/experimental/pandas/io.py \
modin/experimental/pandas/__init__.py
- run: python scripts/doc_checker.py modin/core/storage_formats/base
- run: python scripts/doc_checker.py modin/experimental/core/storage_formats/pyarrow
- run: python scripts/doc_checker.py modin/core/storage_formats/pandas
- run: |
python scripts/doc_checker.py \
57 changes: 25 additions & 32 deletions .github/workflows/ci.yml
@@ -93,11 +93,17 @@ jobs:
- uses: actions/checkout@v3
- uses: ./.github/actions/python-only
- run: python -m pip install -e ".[all]"
- name: Ensure all engines start up
- name: Ensure Ray and Dask engines start up
run: |
MODIN_ENGINE=dask python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
MODIN_ENGINE=ray python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
- name: Ensure MPI engine start up
# Install a working MPI implementation beforehand so mpi4py can link to it
run: |
sudo apt install libmpich-dev
python -m pip install -e ".[mpi]"
MODIN_ENGINE=unidist UNIDIST_BACKEND=mpi mpiexec -n 1 python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
if: matrix.os == 'ubuntu'

test-internals:
needs: [lint-flake8, lint-black-isort]
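
The "Ensure MPI engine start up" step above can be reproduced locally on Ubuntu; a sketch using MPICH, though any working MPI implementation should do:

```bash
# Install an MPI implementation, Modin's MPI extra, and verify that the
# unidist engine starts under mpiexec.
sudo apt install libmpich-dev
python -m pip install -e ".[mpi]"
MODIN_ENGINE=unidist UNIDIST_BACKEND=mpi mpiexec -n 1 \
  python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
```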
@@ -372,7 +378,15 @@
- run: ./.github/workflows/sql_server/set_up_sql_server.sh
# need an extra argument "genv" to set environment variables for mpiexec. We need
# these variables to test writing to the mock s3 filesystem.
- run: mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest modin/pandas/test/test_io.py --verbose
- uses: nick-fields/retry@v2
# to avoid issues with non-stable `to_csv` tests for unidist on MPI backend.
# for details see: https://github.com/modin-project/modin/pull/6776
with:
timeout_minutes: 15
max_attempts: 3
command: |
conda run --no-capture-output -n modin_on_unidist mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key \
-genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest modin/pandas/test/test_io.py --verbose
- run: mpiexec -n 1 python -m pytest modin/experimental/pandas/test/test_io_exp.py
- run: mpiexec -n 1 python -m pytest modin/experimental/sql/test/test_sql.py
- run: mpiexec -n 1 python -m pytest modin/test/interchange/dataframe_protocol/test_general.py
@@ -626,6 +640,15 @@
if: matrix.os != 'windows'
- run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/numpy/test
- run: ${{ matrix.execution.shell-ex }} -m "not exclude_in_sanity" modin/pandas/test/test_io.py --verbose
if: matrix.execution.name != 'unidist'
- uses: nick-fields/retry@v2
# to avoid issues with non-stable `to_csv` tests for unidist on MPI backend.
# for details see: https://github.com/modin-project/modin/pull/6776
with:
timeout_minutes: 15
max_attempts: 3
command: conda run --no-capture-output -n modin_on_unidist ${{ matrix.execution.shell-ex }} -m "not exclude_in_sanity" modin/pandas/test/test_io.py --verbose
if: matrix.execution.name == 'unidist'
- run: ${{ matrix.execution.shell-ex }} modin/experimental/pandas/test/test_io_exp.py
- run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/interchange/dataframe_protocol/test_general.py
- run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/interchange/dataframe_protocol/pandas/test_protocol.py
@@ -668,36 +691,6 @@
- run: python -m pytest modin/pandas/test/test_io.py --verbose
- uses: ./.github/actions/upload-coverage

test-pyarrow:
needs: [lint-flake8, lint-black-isort]
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
strategy:
matrix:
python-version: ["3.9"]
env:
MODIN_STORAGE_FORMAT: pyarrow
MODIN_EXPERIMENTAL: "True"
name: test (pyarrow, python ${{matrix.python-version}})
services:
moto:
image: motoserver/moto
ports:
- 5000:5000
env:
AWS_ACCESS_KEY_ID: foobar_key
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/mamba-env
with:
environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
- run: sudo apt update && sudo apt install -y libhdf5-dev
- run: python -m pytest modin/pandas/test/test_io.py::TestCsv --verbose

test-spreadsheet:
needs: [lint-flake8, lint-black-isort]
runs-on: ubuntu-latest
22 changes: 16 additions & 6 deletions README.md
@@ -57,24 +57,30 @@ The charts below show the speedup you get by replacing pandas with Modin based o
Modin can be installed with `pip` on Linux, Windows and MacOS:

```bash
pip install "modin[all]" # (Recommended) Install Modin with all of Modin's currently supported engines.
pip install "modin[all]" # (Recommended) Install Modin with Ray and Dask engines.
```

If you want to install Modin with a specific engine, we recommend:

```bash
pip install "modin[ray]" # Install Modin dependencies and Ray.
pip install "modin[dask]" # Install Modin dependencies and Dask.
pip install "modin[unidist]" # Install Modin dependencies and Unidist.
pip install "modin[mpi]" # Install Modin dependencies and MPI through unidist.
```

To get Modin on MPI through unidist (as of unidist 0.5.0) fully working
it is required to have a working MPI implementation installed beforehand.
Otherwise, installation of `modin[mpi]` may fail. Refer to
[Installing with pip](https://unidist.readthedocs.io/en/latest/installation.html#installing-with-pip)
section of the unidist documentation for more details about installation.
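
For example, on Debian/Ubuntu the MPI prerequisite can be satisfied with MPICH before pulling in the extra (a sketch only; package names differ across platforms and MPI vendors):

```bash
sudo apt install libmpich-dev   # or another MPI implementation, e.g. Open MPI
pip install "modin[mpi]"        # Install Modin dependencies and MPI through unidist.
```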

Modin automatically detects which engine(s) you have installed and uses that for scheduling computation.

#### From conda-forge

Installing from [conda forge](https://github.com/conda-forge/modin-feedstock) using `modin-all`
will install Modin and four engines: [Ray](https://github.com/ray-project/ray), [Dask](https://github.com/dask/dask),
[Unidist](https://github.com/modin-project/unidist) and [HDK](https://github.com/intel-ai/hdk).
[MPI through unidist](https://github.com/modin-project/unidist) and [HDK](https://github.com/intel-ai/hdk).

```bash
conda install -c conda-forge modin-all
@@ -85,10 +91,14 @@ Each engine can also be installed individually (and also as a combination of sev
```bash
conda install -c conda-forge modin-ray # Install Modin dependencies and Ray.
conda install -c conda-forge modin-dask # Install Modin dependencies and Dask.
conda install -c conda-forge modin-unidist # Install Modin dependencies and Unidist.
conda install -c conda-forge modin-mpi # Install Modin dependencies and MPI through unidist.
conda install -c conda-forge modin-hdk # Install Modin dependencies and HDK.
```

Refer to
[Installing with conda](https://unidist.readthedocs.io/en/latest/installation.html#installing-with-conda)
section of the unidist documentation for more details on how to install a specific MPI implementation to run on.
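
As a sketch, assuming MPICH from conda-forge as the MPI implementation (other implementations such as Open MPI are packaged there as well):

```bash
conda install -c conda-forge modin-mpi mpich   # pick the MPI implementation explicitly
```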

To speed up conda installation we recommend using libmamba solver. To do this install it in a base environment:

```bash
Expand Down Expand Up @@ -119,7 +129,7 @@ export MODIN_ENGINE=unidist # Modin will use Unidist
```

If you want to choose the Unidist engine, you should set the additional environment
variable ``UNIDIST_BACKEND``, because currently Modin only supports Unidist on MPI:
variable ``UNIDIST_BACKEND``. Currently, Modin only supports MPI through unidist:

```bash
export UNIDIST_BACKEND=mpi # Unidist will use MPI backend
@@ -144,7 +154,7 @@ _Note: You should not change the engine after your first operation with Modin as

#### Which engine should I use?

On Linux, MacOS, and Windows you can install and use either Ray, Dask or Unidist. There is no knowledge required
On Linux, MacOS, and Windows you can install and use either Ray, Dask or MPI through unidist. There is no knowledge required
to use either of these engines as Modin abstracts away all of the complexity, so feel
free to pick either!

15 changes: 11 additions & 4 deletions asv_bench/benchmarks/scalability/scalability_benchmarks.py
@@ -14,13 +14,20 @@
"""These benchmarks are supposed to be run only for modin, since they do not make sense for pandas."""

import modin.pandas as pd
from modin.pandas.utils import from_pandas

try:
from modin.utils import to_numpy, to_pandas
from modin.pandas.io import from_pandas
except ImportError:
# This provides compatibility with older versions of the Modin, allowing us to test old commits.
from modin.pandas.utils import to_pandas
from modin.pandas.utils import from_pandas

try:
from modin.pandas.io import to_numpy, to_pandas
except ImportError:
try:
from modin.utils import to_numpy, to_pandas
except ImportError:
# This provides compatibility with older versions of the Modin, allowing us to test old commits.
from modin.pandas.utils import to_pandas

import pandas

2 changes: 1 addition & 1 deletion asv_bench/benchmarks/utils/compatibility.py
@@ -47,4 +47,4 @@
assert ASV_USE_IMPL in ("modin", "pandas")
assert ASV_DATASET_SIZE in ("big", "small")
assert ASV_USE_ENGINE in ("ray", "dask", "python", "native", "unidist")
assert ASV_USE_STORAGE_FORMAT in ("pandas", "hdk", "pyarrow")
assert ASV_USE_STORAGE_FORMAT in ("pandas", "hdk")
20 changes: 19 additions & 1 deletion docs/conf.py
@@ -26,7 +26,15 @@ def noop_decorator(*args, **kwargs):
ray.remote = noop_decorator

# fake modules if they're missing
for mod_name in ("cudf", "cupy", "pyarrow.gandiva", "pyhdk", "pyhdk.hdk", "xgboost"):
for mod_name in (
"cudf",
"cupy",
"pyhdk",
"pyhdk.hdk",
"xgboost",
"unidist",
"unidist.config",
):
try:
__import__(mod_name)
except ImportError:
@@ -52,6 +60,16 @@ def noop_decorator(*args, **kwargs):
sys.modules["pyhdk"].__version__ = "999"
if not hasattr(sys.modules["xgboost"], "Booster"):
sys.modules["xgboost"].Booster = type("Booster", (object,), {})
if not hasattr(sys.modules["unidist"], "remote"):
sys.modules["unidist"].remote = noop_decorator
if not hasattr(sys.modules["unidist"], "core"):
sys.modules["unidist"].core = type("core", (object,), {})
if not hasattr(sys.modules["unidist"].core, "base"):
sys.modules["unidist"].core.base = type("base", (object,), {})
if not hasattr(sys.modules["unidist"].core.base, "object_ref"):
sys.modules["unidist"].core.base.object_ref = type("object_ref", (object,), {})
if not hasattr(sys.modules["unidist"].core.base.object_ref, "ObjectRef"):
sys.modules["unidist"].core.base.object_ref.ObjectRef = type("ObjectRef", (object,), {})

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
import modin