From 99472416b4b330768b72ed0fee792720434a5f7a Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Sun, 5 Nov 2023 21:42:10 +0400 Subject: [PATCH] fix(python): return frames from `read_excel` in the originally specified order (#12243) --- py-polars/polars/io/spreadsheet/functions.py | 76 +++++++++++++------- py-polars/tests/unit/io/test_spreadsheet.py | 8 +-- 2 files changed, 55 insertions(+), 29 deletions(-) diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 96aaf78d90dd..0d94f5ba9bad 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -403,11 +403,6 @@ def _read_spreadsheet( *, raise_if_empty: bool = True, ) -> pl.DataFrame | dict[str, pl.DataFrame]: - if sheet_id is not None and sheet_name is not None: - raise ValueError( - f"cannot specify both `sheet_name` ({sheet_name!r}) and `sheet_id` ({sheet_id!r})" - ) - if isinstance(source, (str, Path)): source = normalize_filepath(source) @@ -421,27 +416,9 @@ def _read_spreadsheet( reader_fn, parser, worksheets = _initialise_spreadsheet_parser( engine, source, engine_options or {} ) - - # determine which named worksheets to read - if sheet_id is None and sheet_name is None: - sheet_names = [worksheets[0]["name"]] - return_multi = False - else: - return_multi = ( - sheet_id == 0 - or isinstance(sheet_id, Sequence) - or (isinstance(sheet_name, Sequence) and not isinstance(sheet_name, str)) - ) - ids = (sheet_id,) if isinstance(sheet_id, int) else sheet_id or () - names = (sheet_name,) if isinstance(sheet_name, str) else sheet_name or () - sheet_names = [ - ws["name"] - for ws in worksheets - if (sheet_id == 0 or ws["index"] in ids or ws["name"] in names) - ] - - # read data from the indicated sheet(s) try: + # parse data from the indicated sheet(s) + sheet_names, return_multi = _get_sheet_names(sheet_id, sheet_name, worksheets) parsed_sheets = { name: reader_fn( parser=parser, @@ -465,6 +442,55 @@ def _read_spreadsheet( return next(iter(parsed_sheets.values())) +def _get_sheet_names( + sheet_id: int | Sequence[int] | None, + sheet_name: str | list[str] | tuple[str] | None, + worksheets: list[dict[str, Any]], +) -> tuple[list[str], bool]: + """Establish sheets to read; indicate if we are returning a dict frames.""" + if sheet_id is not None and sheet_name is not None: + raise ValueError( + f"cannot specify both `sheet_name` ({sheet_name!r}) and `sheet_id` ({sheet_id!r})" + ) + sheet_names = [] + if sheet_id is None and sheet_name is None: + sheet_names.append(worksheets[0]["name"]) + return_multi = False + elif sheet_id == 0: + sheet_names.extend(ws["name"] for ws in worksheets) + return_multi = True + else: + return_multi = ( + (isinstance(sheet_name, Sequence) and not isinstance(sheet_name, str)) + or isinstance(sheet_id, Sequence) + or sheet_id == 0 + ) + if names := ( + (sheet_name,) if isinstance(sheet_name, str) else sheet_name or () + ): + known_sheet_names = {ws["name"] for ws in worksheets} + for name in names: + if name not in known_sheet_names: + raise ValueError( + f"no matching sheet found when `sheet_name` is {name!r}" + ) + sheet_names.append(name) + else: + ids = (sheet_id,) if isinstance(sheet_id, int) else sheet_id or () + sheet_names_by_idx = { + idx: ws["name"] + for idx, ws in enumerate(worksheets, start=1) + if (sheet_id == 0 or ws["index"] in ids or ws["name"] in names) + } + for idx in ids: + if (name := sheet_names_by_idx.get(idx)) is None: # type: ignore[assignment] + raise ValueError( + f"no matching sheet found when `sheet_id` is {idx}" + ) + sheet_names.append(name) + return sheet_names, return_multi + + def _initialise_spreadsheet_parser( engine: Literal["xlsx2csv", "openpyxl", "pyxlsb", "ods"], source: str | BytesIO | Path | BinaryIO | bytes, diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index a58c5f90cfaa..36af7460ac6f 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -100,18 +100,18 @@ def test_read_excel_multi_sheets( spreadsheet_path = request.getfixturevalue(source) frames_by_id = read_spreadsheet( spreadsheet_path, - sheet_id=[1, 2], + sheet_id=[2, 1], sheet_name=None, **params, ) frames_by_name = read_spreadsheet( spreadsheet_path, sheet_id=None, - sheet_name=["test1", "test2"], + sheet_name=["test2", "test1"], **params, ) for frames in (frames_by_id, frames_by_name): - assert len(frames) == 2 + assert list(frames_by_name) == ["test2", "test1"] expected1 = pl.DataFrame({"hello": ["Row 1", "Row 2"]}) expected2 = pl.DataFrame({"world": ["Row 3", "Row 4"]}) @@ -218,7 +218,7 @@ def test_read_invalid_worksheet( value = sheet_id if param == "id" else sheet_name with pytest.raises( ValueError, - match=f"no matching sheets found when `sheet_{param}` is {value!r}", + match=f"no matching sheet found when `sheet_{param}` is {value!r}", ): read_spreadsheet( spreadsheet_path, sheet_id=sheet_id, sheet_name=sheet_name, **params