From fedb71f7ecfefcb64e642ecf37f653c4088db886 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Sun, 22 Dec 2024 11:30:18 +0000 Subject: [PATCH] feat(python): Support loading data from multiple Excel/ODS workbooks --- py-polars/polars/_typing.py | 2 +- py-polars/polars/io/avro.py | 5 +- py-polars/polars/io/csv/functions.py | 10 +- py-polars/polars/io/ipc/functions.py | 15 +- py-polars/polars/io/json/read.py | 5 +- py-polars/polars/io/ndjson.py | 5 +- py-polars/polars/io/parquet/functions.py | 13 +- py-polars/polars/io/spreadsheet/functions.py | 171 ++++++++++++------- py-polars/tests/unit/io/test_spreadsheet.py | 41 ++++- 9 files changed, 171 insertions(+), 96 deletions(-) diff --git a/py-polars/polars/_typing.py b/py-polars/polars/_typing.py index ea6284cda9c9..88a0882e5679 100644 --- a/py-polars/polars/_typing.py +++ b/py-polars/polars/_typing.py @@ -299,7 +299,7 @@ def fetchmany(self, *args: Any, **kwargs: Any) -> Any: # LazyFrame engine selection EngineType: TypeAlias = Union[Literal["cpu", "gpu"], "GPUEngine"] -ScanSource: TypeAlias = Union[ +FileSource: TypeAlias = Union[ str, Path, IO[bytes], diff --git a/py-polars/polars/io/avro.py b/py-polars/polars/io/avro.py index 087c7660dc11..25baa5be3147 100644 --- a/py-polars/polars/io/avro.py +++ b/py-polars/polars/io/avro.py @@ -29,9 +29,8 @@ def read_avro( source Path to a file or a file-like object (by "file-like object" we refer to objects that have a `read()` method, such as a file handler like the builtin `open` - function, or a `BytesIO` instance). - For file-like objects, - stream position may not be updated accordingly after reading. + function, or a `BytesIO` instance). For file-like objects, the stream position + may not be updated accordingly after reading. columns Columns to select. Accepts a list of column indices (starting at zero) or a list of column names. 
diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index cd543fc659cb..3296af2fdaa7 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -90,9 +90,8 @@ def read_csv( Path to a file or a file-like object (by "file-like object" we refer to objects that have a `read()` method, such as a file handler like the builtin `open` function, or a `BytesIO` instance). If `fsspec` is installed, it will be used - to open remote files. - For file-like objects, - stream position may not be updated accordingly after reading. + to open remote files. For file-like objects, the stream position may not be + updated accordingly after reading. has_header Indicate if the first row of the dataset is a header or not. If set to False, column names will be autogenerated in the following format: `column_x`, with @@ -764,9 +763,8 @@ def read_csv_batched( Path to a file or a file-like object (by "file-like object" we refer to objects that have a `read()` method, such as a file handler like the builtin `open` function, or a `BytesIO` instance). If `fsspec` is installed, it will be used - to open remote files. - For file-like objects, - stream position may not be updated accordingly after reading. + to open remote files. For file-like objects, the stream position may not be + updated accordingly after reading. has_header Indicate if the first row of the dataset is a header or not. If set to False, column names will be autogenerated in the following format: `column_x`, with diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index b8af12ae8806..984ec54d966f 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -62,9 +62,8 @@ def read_ipc( Path to a file or a file-like object (by "file-like object" we refer to objects that have a `read()` method, such as a file handler like the builtin `open` function, or a `BytesIO` instance). 
If `fsspec` is installed, it will be used - to open remote files. - For file-like objects, - stream position may not be updated accordingly after reading. + to open remote files. For file-like objects, the stream position may not be + updated accordingly after reading. columns Columns to select. Accepts a list of column indices (starting at zero) or a list of column names. @@ -241,9 +240,8 @@ def read_ipc_stream( Path to a file or a file-like object (by "file-like object" we refer to objects that have a `read()` method, such as a file handler like the builtin `open` function, or a `BytesIO` instance). If `fsspec` is installed, it will be used - to open remote files. - For file-like objects, - stream position may not be updated accordingly after reading. + to open remote files. For file-like objects, the stream position may not be + updated accordingly after reading. columns Columns to select. Accepts a list of column indices (starting at zero) or a list of column names. @@ -331,9 +329,8 @@ def read_ipc_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataTyp source Path to a file or a file-like object (by "file-like object" we refer to objects that have a `read()` method, such as a file handler like the builtin `open` - function, or a `BytesIO` instance). - For file-like objects, - stream position may not be updated accordingly after reading. + function, or a `BytesIO` instance). For file-like objects, the stream position + may not be updated accordingly after reading. Returns ------- diff --git a/py-polars/polars/io/json/read.py b/py-polars/polars/io/json/read.py index ef7c5c9494b1..79862f80feb5 100644 --- a/py-polars/polars/io/json/read.py +++ b/py-polars/polars/io/json/read.py @@ -34,9 +34,8 @@ def read_json( source Path to a file or a file-like object (by "file-like object" we refer to objects that have a `read()` method, such as a file handler like the builtin `open` - function, or a `BytesIO` instance). 
- For file-like objects, - stream position may not be updated accordingly after reading. + function, or a `BytesIO` instance). For file-like objects, the stream position + may not be updated accordingly after reading. schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict The DataFrame schema may be declared in several ways: diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 7da4635408d1..445889712244 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -51,9 +51,8 @@ def read_ndjson( source Path to a file or a file-like object (by "file-like object" we refer to objects that have a `read()` method, such as a file handler like the builtin `open` - function, or a `BytesIO` instance). - For file-like objects, - stream position may not be updated accordingly after reading. + function, or a `BytesIO` instance). For file-like objects, the stream position + may not be updated accordingly after reading. schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict The DataFrame schema may be declared in several ways: diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 0cc91a94693f..7080738e2e84 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -31,14 +31,14 @@ from typing import Literal from polars import DataFrame, DataType, LazyFrame - from polars._typing import ParallelStrategy, ScanSource, SchemaDict + from polars._typing import FileSource, ParallelStrategy, SchemaDict from polars.io.cloud import CredentialProviderFunction @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def read_parquet( - source: ScanSource, + source: FileSource, *, columns: list[int] | list[str] | None = None, n_rows: int | None = None, @@ -74,7 +74,7 @@ def read_parquet( File-like objects 
are supported (by "file-like object" we refer to objects that have a `read()` method, such as a file handler like the builtin `open` - function, or a `BytesIO` instance) For file-like objects, stream position + function, or a `BytesIO` instance). For file-like objects, the stream position may not be updated accordingly after reading. columns Columns to select. Accepts a list of column indices (starting at zero) or a list @@ -304,9 +304,8 @@ def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, Dat source Path to a file or a file-like object (by "file-like object" we refer to objects that have a `read()` method, such as a file handler like the builtin `open` - function, or a `BytesIO` instance). - For file-like objects, - stream position may not be updated accordingly after reading. + function, or a `BytesIO` instance). For file-like objects, the stream position + may not be updated accordingly after reading. Returns ------- @@ -322,7 +321,7 @@ def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, Dat @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_parquet( - source: ScanSource, + source: FileSource, *, n_rows: int | None = None, row_index_name: str | None = None, diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index ecce10d34d72..700d3e5cea13 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -4,7 +4,9 @@ import warnings from collections.abc import Sequence from datetime import time +from glob import glob from io import BufferedReader, BytesIO, StringIO, TextIOWrapper +from os import PathLike from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Callable, NoReturn, overload @@ -34,18 +36,38 @@ NoDataError, ParameterCollisionError, ) +from polars.functions import 
def _sources(
    source: FileSource,
) -> tuple[Any, bool]:
    """
    Normalise `source` into a flat list of individual workbook sources.

    Parameters
    ----------
    source
        A single source (path, glob pattern, raw `bytes`, or file-like object),
        or a sequence of such sources.

    Returns
    -------
    tuple
        `(sources, read_multiple_workbooks)`, where `sources` is a list containing
        one entry per workbook to read, and `read_multiple_workbooks` is True if
        the caller supplied a sequence of sources or a glob pattern that expanded
        to one or more files (in which case the resulting frames are expected to
        be combined rather than returned individually).
    """
    # `str` and `bytes` both register as `Sequence`, but each of them represents
    # a *single* workbook (a path/pattern, or the raw file content); iterating
    # over them would yield individual characters / integers.
    read_multiple_workbooks = isinstance(source, Sequence) and not isinstance(
        source, (str, bytes)
    )
    candidates: Any = source if read_multiple_workbooks else [source]

    sources: list[Any] = []
    for src in candidates:
        if isinstance(src, (str, PathLike)) and not Path(src).exists():
            # not an existing file, so try it as a glob pattern; sort the matches
            # so that the workbook order does not depend on the filesystem
            matches = sorted(glob(str(src), recursive=True))  # noqa: PTH207
            if matches:
                sources.extend(matches)
                read_multiple_workbooks = True
            else:
                # nothing matched (eg: a URL, or a genuinely missing file); keep
                # the source as-is instead of silently dropping it, so that an
                # empty result cannot surface later as an opaque IndexError
                # NOTE(review): assumes the downstream reader resolves URLs and
                # reports missing files itself - confirm against the caller
                sources.append(src)
        else:
            sources.append(src)

    return sources, read_multiple_workbooks
@deprecate_renamed_parameter("xlsx2csv_options", "engine_options", version="0.20.6") @deprecate_renamed_parameter("read_csv_options", "read_options", version="0.20.7") def read_excel( - source: str | Path | IO[bytes] | bytes, + source: FileSource, *, sheet_id: int | Sequence[int] | None = None, sheet_name: str | list[str] | tuple[str] | None = None, @@ -173,6 +195,8 @@ def read_excel( """ Read Excel spreadsheet data into a DataFrame. + .. versionadded:: 1.18 + Support loading data from a list (or glob pattern) of multiple workbooks. .. versionchanged:: 1.0 Default engine is now "calamine" (was "xlsx2csv"). .. versionadded:: 0.20.6 @@ -183,18 +207,17 @@ def read_excel( Parameters ---------- source - Path to a file or a file-like object (by "file-like object" we refer to objects - that have a `read()` method, such as a file handler like the builtin `open` - function, or a `BytesIO` instance). - For file-like objects, - stream position may not be updated accordingly after reading. + Path(s) to a file or a file-like object (by "file-like object" we refer to + objects that have a `read()` method, such as a file handler like the builtin + `open` function, or a `BytesIO` instance). For file-like objects, the stream + position may not be updated after reading. sheet_id Sheet number(s) to convert (set `0` to load all sheets as DataFrames) and return a `{sheetname:frame,}` dict. (Defaults to `1` if neither this nor `sheet_name` are specified). Can also take a sequence of sheet numbers. sheet_name - Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If more - than one is given then a `{sheetname:frame,}` dict is returned. + Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If + more than one is given then a `{sheetname:frame,}` dict is returned. engine : {'calamine', 'xlsx2csv', 'openpyxl'} Library used to parse the spreadsheet file; defaults to "calamine". @@ -297,25 +320,33 @@ def read_excel( ... 
read_options={"has_header": False, "new_columns": ["a", "b", "c"]}, ... ) # doctest: +SKIP """ - return _read_spreadsheet( - sheet_id, - sheet_name, - source=source, - engine=engine, - engine_options=engine_options, - read_options=read_options, - schema_overrides=schema_overrides, - infer_schema_length=infer_schema_length, - raise_if_empty=raise_if_empty, - has_header=has_header, - columns=columns, - drop_empty_rows=drop_empty_rows, - ) + sources, read_multiple_workbooks = _sources(source) + frames = [ + _read_spreadsheet( + src, + sheet_id=sheet_id, + sheet_name=sheet_name, + engine=engine, + engine_options=engine_options, + read_options=read_options, + schema_overrides=schema_overrides, + infer_schema_length=infer_schema_length, + raise_if_empty=raise_if_empty, + has_header=has_header, + columns=columns, + drop_empty_rows=drop_empty_rows, + read_multiple_workbooks=read_multiple_workbooks, + ) + for src in sources + ] + if read_multiple_workbooks: + return concat(frames, how="vertical_relaxed") # type: ignore[type-var] + return frames[0] @overload def read_ods( - source: str | Path | IO[bytes] | bytes, + source: FileSource, *, sheet_id: None = ..., sheet_name: str, @@ -330,7 +361,7 @@ def read_ods( @overload def read_ods( - source: str | Path | IO[bytes] | bytes, + source: FileSource, *, sheet_id: None = ..., sheet_name: None = ..., @@ -345,7 +376,7 @@ def read_ods( @overload def read_ods( - source: str | Path | IO[bytes] | bytes, + source: FileSource, *, sheet_id: int, sheet_name: str, @@ -360,7 +391,7 @@ def read_ods( @overload # type: ignore[overload-overlap] def read_ods( - source: str | Path | IO[bytes] | bytes, + source: FileSource, *, sheet_id: Literal[0] | Sequence[int], sheet_name: None = ..., @@ -375,7 +406,7 @@ def read_ods( @overload def read_ods( - source: str | Path | IO[bytes] | bytes, + source: FileSource, *, sheet_id: int, sheet_name: None = ..., @@ -390,7 +421,7 @@ def read_ods( @overload def read_ods( - source: str | Path | IO[bytes] | bytes, + 
source: FileSource, *, sheet_id: None, sheet_name: list[str] | tuple[str], @@ -404,7 +435,7 @@ def read_ods( def read_ods( - source: str | Path | IO[bytes] | bytes, + source: FileSource, *, sheet_id: int | Sequence[int] | None = None, sheet_name: str | list[str] | tuple[str] | None = None, @@ -423,9 +454,8 @@ def read_ods( source Path to a file or a file-like object (by "file-like object" we refer to objects that have a `read()` method, such as a file handler like the builtin `open` - function, or a `BytesIO` instance). - For file-like objects, - stream position may not be updated accordingly after reading. + function, or a `BytesIO` instance). For file-like objects, the stream position + may not be updated accordingly after reading. sheet_id Sheet number(s) to convert, starting from 1 (set `0` to load *all* worksheets as DataFrames) and return a `{sheetname:frame,}` dict. (Defaults to `1` if @@ -480,27 +510,35 @@ def read_ods( ... raise_if_empty=False, ... ) # doctest: +SKIP """ - return _read_spreadsheet( - sheet_id, - sheet_name, - source=source, - engine="calamine", - engine_options={}, - read_options=None, - schema_overrides=schema_overrides, - infer_schema_length=infer_schema_length, - raise_if_empty=raise_if_empty, - drop_empty_rows=drop_empty_rows, - has_header=has_header, - columns=columns, - ) + sources, read_multiple_workbooks = _sources(source) + frames = [ + _read_spreadsheet( + src, + sheet_id=sheet_id, + sheet_name=sheet_name, + engine="calamine", + engine_options={}, + read_options=None, + schema_overrides=schema_overrides, + infer_schema_length=infer_schema_length, + raise_if_empty=raise_if_empty, + drop_empty_rows=drop_empty_rows, + has_header=has_header, + columns=columns, + read_multiple_workbooks=read_multiple_workbooks, + ) + for src in sources + ] + if read_multiple_workbooks: + return concat(frames, how="vertical_relaxed") # type: ignore[type-var] + return frames[0] def _read_spreadsheet( - sheet_id: int | Sequence[int] | None, - sheet_name: 
str | list[str] | tuple[str] | None, - *, source: str | Path | IO[bytes] | bytes, + *, + sheet_id: int | Sequence[int] | None, + sheet_name: str | Sequence[str] | None, engine: ExcelSpreadsheetEngine, engine_options: dict[str, Any] | None = None, read_options: dict[str, Any] | None = None, @@ -510,6 +548,7 @@ def _read_spreadsheet( has_header: bool = True, raise_if_empty: bool = True, drop_empty_rows: bool = True, + read_multiple_workbooks: bool = False, ) -> pl.DataFrame | dict[str, pl.DataFrame]: if isinstance(source, (str, Path)): source = normalize_filepath(source) @@ -532,7 +571,13 @@ def _read_spreadsheet( ) try: # parse data from the indicated sheet(s) - sheet_names, return_multi = _get_sheet_names(sheet_id, sheet_name, worksheets) + sheet_names, return_multiple_sheets = _get_sheet_names( + sheet_id, sheet_name, worksheets + ) + if read_multiple_workbooks and return_multiple_sheets: + msg = "cannot return multiple sheets from multiple workbooks" + raise ValueError(msg) + parsed_sheets = { name: reader_fn( parser=parser, @@ -554,7 +599,7 @@ def _read_spreadsheet( msg = f"no matching sheets found when `sheet_{param}` is {value!r}" raise ValueError(msg) - if return_multi: + if return_multiple_sheets: return parsed_sheets return next(iter(parsed_sheets.values())) @@ -614,7 +659,7 @@ def _get_read_options( def _get_sheet_names( sheet_id: int | Sequence[int] | None, - sheet_name: str | list[str] | tuple[str] | None, + sheet_name: str | Sequence[str] | None, worksheets: list[dict[str, Any]], ) -> tuple[list[str], bool]: """Establish sheets to read; indicate if we are returning a dict frames.""" @@ -625,12 +670,12 @@ def _get_sheet_names( sheet_names = [] if sheet_id is None and sheet_name is None: sheet_names.append(worksheets[0]["name"]) - return_multi = False + return_multiple_sheets = False elif sheet_id == 0: sheet_names.extend(ws["name"] for ws in worksheets) - return_multi = True + return_multiple_sheets = True else: - return_multi = ( + 
@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "params"),
    [
        # xls file
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
        # xlsx file
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        # xlsb file (binary)
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
        # open document
        (pl.read_ods, "path_ods", {}),
    ],
)
def test_read_excel_multiple_workbooks(
    read_spreadsheet: Callable[..., pl.DataFrame],
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    """Check that the same sheet read from several workbooks is stacked vertically."""
    workbook = request.getfixturevalue(source)

    # pass the same workbook three times; the "test1" sheet of each one
    # should be read and the rows returned in a single frame
    workbooks = [workbook] * 3
    result = read_spreadsheet(
        workbooks,
        sheet_id=None,
        sheet_name="test1",
        **params,
    )

    expected = pl.DataFrame({"hello": ["Row 1", "Row 2"] * 3})
    assert_frame_equal(result, expected)
@pytest.mark.parametrize( ("read_spreadsheet", "source", "params"), [