Skip to content

Commit

Permalink
REFACTOR-modin-project#6815: move experimental parsers into 'modin.ex…
Browse files Browse the repository at this point in the history
…perimental' folder

Signed-off-by: Anatoly Myachev <[email protected]>
  • Loading branch information
anmyachev committed Dec 8, 2023
1 parent c3a4f78 commit d73f0f8
Show file tree
Hide file tree
Showing 9 changed files with 384 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@
SQLDispatcher,
)
from modin.core.storage_formats.pandas.parsers import (
ExperimentalCustomTextParser,
ExperimentalPandasPickleParser,
PandasCSVGlobParser,
PandasCSVParser,
PandasExcelParser,
PandasFeatherParser,
Expand All @@ -49,6 +46,11 @@
ExperimentalPickleDispatcher,
ExperimentalSQLDispatcher,
)
from modin.experimental.core.storage_formats.pandas.parsers import (
ExperimentalCustomTextParser,
ExperimentalPandasPickleParser,
PandasCSVGlobParser,
)


class PandasOnDaskIO(BaseIO):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,6 @@
SQLDispatcher,
)
from modin.core.storage_formats.pandas.parsers import (
ExperimentalCustomTextParser,
ExperimentalPandasPickleParser,
PandasCSVGlobParser,
PandasCSVParser,
PandasExcelParser,
PandasFeatherParser,
Expand All @@ -48,6 +45,11 @@
ExperimentalPickleDispatcher,
ExperimentalSQLDispatcher,
)
from modin.experimental.core.storage_formats.pandas.parsers import (
ExperimentalCustomTextParser,
ExperimentalPandasPickleParser,
PandasCSVGlobParser,
)

from ..dataframe import PandasOnRayDataframe
from ..partitioning import PandasOnRayDataframePartition
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,6 @@
SQLDispatcher,
)
from modin.core.storage_formats.pandas.parsers import (
ExperimentalCustomTextParser,
ExperimentalPandasPickleParser,
PandasCSVGlobParser,
PandasCSVParser,
PandasExcelParser,
PandasFeatherParser,
Expand All @@ -47,6 +44,11 @@
ExperimentalPickleDispatcher,
ExperimentalSQLDispatcher,
)
from modin.experimental.core.storage_formats.pandas.parsers import (
ExperimentalCustomTextParser,
ExperimentalPandasPickleParser,
PandasCSVGlobParser,
)

from ..dataframe import PandasOnUnidistDataframe
from ..partitioning import PandasOnUnidistDataframePartition
Expand Down
29 changes: 0 additions & 29 deletions modin/core/storage_formats/pandas/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,35 +440,6 @@ def parse(chunks, **kwargs):
]


@doc(_doc_pandas_parser_class, data_type="pickled pandas objects")
class ExperimentalPandasPickleParser(PandasParser):
    # NOTE(review): the ``@doc`` decorators are fed docstring templates, so the
    # notes below are kept as ``#`` comments instead of literal docstrings
    # (presumably a literal docstring would be merged into the generated one).
    @staticmethod
    @doc(_doc_parse_func, parameters=_doc_parse_parameters_common)
    def parse(fname, **kwargs):
        # Installs a blanket "ignore" filter in the warnings module of the
        # process that executes this function; the filter is not removed afterwards.
        warnings.filterwarnings("ignore")

        # ``single_worker_read`` is popped so that it is not forwarded to
        # ``pandas.read_pickle`` together with the remaining keyword arguments.
        single_worker_read = kwargs.pop("single_worker_read", None)
        df = pandas.read_pickle(fname, **kwargs)

        # In single-worker mode the unpickled object is handed back untouched,
        # before the DataFrame type check below is applied.
        if single_worker_read:
            return df

        assert isinstance(
            df, pandas.DataFrame
        ), f"Pickled obj type: [{type(df)}] in [{fname}]; works only with pandas.DataFrame"

        row_count = len(df)
        column_count = len(df.columns)

        # The frame is passed to the splitting helper with both leading arguments
        # equal to 1; the frame dimensions are appended after the helper's output.
        split_count = 1
        pieces = _split_result_for_readers(1, split_count, df)
        return pieces + [row_count, column_count]


@doc(_doc_pandas_parser_class, data_type="custom text")
class ExperimentalCustomTextParser(PandasParser):
    # NOTE(review): documentation is supplied through the ``@doc`` decorators,
    # so only ``#`` comments are used inside this class.
    @staticmethod
    @doc(_doc_parse_func, parameters=_doc_parse_parameters_common)
    def parse(fname, **kwargs):
        # No custom-text specific handling happens here: the file name and all
        # keyword arguments are delegated unchanged to the generic parser.
        parsed = PandasParser.generic_parse(fname, **kwargs)
        return parsed


@doc(_doc_pandas_parser_class, data_type="tables with fixed-width formatted lines")
class PandasFWFParser(PandasParser):
@staticmethod
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""
Module houses experimental IO classes and the parser functions these classes need.
Any function or class can be considered experimental API if it does not strictly replicate the existing
Query Compiler API, even if it only extends that API.
"""

from modin.core.execution.dask.common import DaskWrapper
from modin.core.execution.dask.implementations.pandas_on_dask.dataframe import (
PandasOnDaskDataframe,
)
from modin.core.execution.dask.implementations.pandas_on_dask.io import PandasOnDaskIO
from modin.core.execution.dask.implementations.pandas_on_dask.partitioning import (
PandasOnDaskDataframePartition,
)
from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler
from modin.experimental.core.io import (
ExperimentalCSVGlobDispatcher,
ExperimentalCustomTextDispatcher,
ExperimentalPickleDispatcher,
ExperimentalSQLDispatcher,
)
from modin.experimental.core.storage_formats.pandas.parsers import (
ExperimentalCustomTextParser,
ExperimentalPandasPickleParser,
PandasCSVGlobParser,
)


class ExperimentalPandasOnDaskIO(PandasOnDaskIO):
    """
    Experimental IO entry points for the pandas storage format on the Dask engine.

    The class derives from ``PandasOnDaskIO`` and, on top of what it inherits,
    defines the experimental readers/writers assigned in the class body below.
    """

    # Attributes injected into every anonymous class composed by the helpers below.
    build_args = {
        "frame_partition_cls": PandasOnDaskDataframePartition,
        "query_compiler_cls": PandasQueryCompiler,
        "frame_cls": PandasOnDaskDataframe,
        "base_io": PandasOnDaskIO,
    }

    # ``build_args`` is bound as a default value because names from the class
    # body are not visible inside the helper functions' own scope.
    def __make_read(*classes, build_args=build_args):
        # Compose an unnamed class from the Dask wrapper plus the given
        # parser/dispatcher classes and hand back its ``read`` attribute.
        composed = type("", (DaskWrapper, *classes), build_args)
        return composed.read

    def __make_write(*classes, build_args=build_args):
        # Same composition as above, but exposing the ``write`` attribute.
        composed = type("", (DaskWrapper, *classes), build_args)
        return composed.write

    read_csv_glob = __make_read(PandasCSVGlobParser, ExperimentalCSVGlobDispatcher)
    read_pickle_distributed = __make_read(
        ExperimentalPandasPickleParser, ExperimentalPickleDispatcher
    )
    to_pickle_distributed = __make_write(ExperimentalPickleDispatcher)
    read_custom_text = __make_read(
        ExperimentalCustomTextParser, ExperimentalCustomTextDispatcher
    )
    read_sql = __make_read(ExperimentalSQLDispatcher)

    # The helpers are only needed while the class body executes; remove them
    # so they do not end up in the class namespace.
    del __make_read, __make_write
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""
Module houses experimental IO classes and the parser functions these classes need.
Any function or class can be considered experimental API if it does not strictly replicate the existing
Query Compiler API, even if it only extends that API.
"""

from modin.core.execution.ray.common import RayWrapper
from modin.core.execution.ray.implementations.pandas_on_ray.dataframe import (
PandasOnRayDataframe,
)
from modin.core.execution.ray.implementations.pandas_on_ray.io import PandasOnRayIO
from modin.core.execution.ray.implementations.pandas_on_ray.partitioning import (
PandasOnRayDataframePartition,
)
from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler
from modin.experimental.core.io import (
ExperimentalCSVGlobDispatcher,
ExperimentalCustomTextDispatcher,
ExperimentalPickleDispatcher,
ExperimentalSQLDispatcher,
)
from modin.experimental.core.storage_formats.pandas.parsers import (
ExperimentalCustomTextParser,
ExperimentalPandasPickleParser,
PandasCSVGlobParser,
)


class ExperimentalPandasOnRayIO(PandasOnRayIO):
    """
    Experimental IO entry points for the pandas storage format on the Ray engine.

    The class derives from ``PandasOnRayIO`` and, on top of what it inherits,
    defines the experimental readers/writers assigned in the class body below.
    """

    # Attributes injected into every anonymous class composed by the helpers below.
    build_args = {
        "frame_partition_cls": PandasOnRayDataframePartition,
        "query_compiler_cls": PandasQueryCompiler,
        "frame_cls": PandasOnRayDataframe,
        "base_io": PandasOnRayIO,
    }

    # ``build_args`` is bound as a default value because names from the class
    # body are not visible inside the helper functions' own scope.
    def __make_read(*classes, build_args=build_args):
        # Compose an unnamed class from the Ray wrapper plus the given
        # parser/dispatcher classes and hand back its ``read`` attribute.
        composed = type("", (RayWrapper, *classes), build_args)
        return composed.read

    def __make_write(*classes, build_args=build_args):
        # Same composition as above, but exposing the ``write`` attribute.
        composed = type("", (RayWrapper, *classes), build_args)
        return composed.write

    read_csv_glob = __make_read(PandasCSVGlobParser, ExperimentalCSVGlobDispatcher)
    read_pickle_distributed = __make_read(
        ExperimentalPandasPickleParser, ExperimentalPickleDispatcher
    )
    to_pickle_distributed = __make_write(ExperimentalPickleDispatcher)
    read_custom_text = __make_read(
        ExperimentalCustomTextParser, ExperimentalCustomTextDispatcher
    )
    read_sql = __make_read(ExperimentalSQLDispatcher)

    # The helpers are only needed while the class body executes; remove them
    # so they do not end up in the class namespace.
    del __make_read, __make_write
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""
Module houses experimental IO classes and the parser functions these classes need.
Any function or class can be considered experimental API if it does not strictly replicate the existing
Query Compiler API, even if it only extends that API.
"""

from modin.core.execution.unidist.common import UnidistWrapper
from modin.core.execution.unidist.implementations.pandas_on_unidist.dataframe import (
PandasOnUnidistDataframe,
)
from modin.core.execution.unidist.implementations.pandas_on_unidist.io import (
PandasOnUnidistIO,
)
from modin.core.execution.unidist.implementations.pandas_on_unidist.partitioning import (
PandasOnUnidistDataframePartition,
)
from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler
from modin.experimental.core.io import (
ExperimentalCSVGlobDispatcher,
ExperimentalCustomTextDispatcher,
ExperimentalPickleDispatcher,
ExperimentalSQLDispatcher,
)
from modin.experimental.core.storage_formats.pandas.parsers import (
ExperimentalCustomTextParser,
ExperimentalPandasPickleParser,
PandasCSVGlobParser,
)


class ExperimentalPandasOnUnidistIO(PandasOnUnidistIO):
    """
    Experimental IO entry points for the pandas storage format on the unidist engine.

    The class derives from ``PandasOnUnidistIO`` and, on top of what it inherits,
    defines the experimental readers/writers assigned in the class body below.
    """

    # Attributes injected into every anonymous class composed by the helpers below.
    build_args = {
        "frame_partition_cls": PandasOnUnidistDataframePartition,
        "query_compiler_cls": PandasQueryCompiler,
        "frame_cls": PandasOnUnidistDataframe,
        "base_io": PandasOnUnidistIO,
    }

    # ``build_args`` is bound as a default value because names from the class
    # body are not visible inside the helper functions' own scope.
    def __make_read(*classes, build_args=build_args):
        # Compose an unnamed class from the unidist wrapper plus the given
        # parser/dispatcher classes and hand back its ``read`` attribute.
        composed = type("", (UnidistWrapper, *classes), build_args)
        return composed.read

    def __make_write(*classes, build_args=build_args):
        # Same composition as above, but exposing the ``write`` attribute.
        composed = type("", (UnidistWrapper, *classes), build_args)
        return composed.write

    read_csv_glob = __make_read(PandasCSVGlobParser, ExperimentalCSVGlobDispatcher)
    read_pickle_distributed = __make_read(
        ExperimentalPandasPickleParser, ExperimentalPickleDispatcher
    )
    to_pickle_distributed = __make_write(ExperimentalPickleDispatcher)
    read_custom_text = __make_read(
        ExperimentalCustomTextParser, ExperimentalCustomTextDispatcher
    )
    read_sql = __make_read(ExperimentalSQLDispatcher)

    # The helpers are only needed while the class body executes; remove them
    # so they do not end up in the class namespace.
    del __make_read, __make_write
14 changes: 14 additions & 0 deletions modin/experimental/core/storage_formats/pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""The module represents the query compiler level for the pandas storage format (experimental)."""
Loading

0 comments on commit d73f0f8

Please sign in to comment.