Skip to content

Commit

Permalink
fix(python): return frames from read_excel in the originally specif…
Browse files Browse the repository at this point in the history
…ied order (pola-rs#12243)
  • Loading branch information
alexander-beedie authored Nov 5, 2023
1 parent 977d35c commit 9947241
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 29 deletions.
76 changes: 51 additions & 25 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,11 +403,6 @@ def _read_spreadsheet(
*,
raise_if_empty: bool = True,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
if sheet_id is not None and sheet_name is not None:
raise ValueError(
f"cannot specify both `sheet_name` ({sheet_name!r}) and `sheet_id` ({sheet_id!r})"
)

if isinstance(source, (str, Path)):
source = normalize_filepath(source)

Expand All @@ -421,27 +416,9 @@ def _read_spreadsheet(
reader_fn, parser, worksheets = _initialise_spreadsheet_parser(
engine, source, engine_options or {}
)

# determine which named worksheets to read
if sheet_id is None and sheet_name is None:
sheet_names = [worksheets[0]["name"]]
return_multi = False
else:
return_multi = (
sheet_id == 0
or isinstance(sheet_id, Sequence)
or (isinstance(sheet_name, Sequence) and not isinstance(sheet_name, str))
)
ids = (sheet_id,) if isinstance(sheet_id, int) else sheet_id or ()
names = (sheet_name,) if isinstance(sheet_name, str) else sheet_name or ()
sheet_names = [
ws["name"]
for ws in worksheets
if (sheet_id == 0 or ws["index"] in ids or ws["name"] in names)
]

# read data from the indicated sheet(s)
try:
# parse data from the indicated sheet(s)
sheet_names, return_multi = _get_sheet_names(sheet_id, sheet_name, worksheets)
parsed_sheets = {
name: reader_fn(
parser=parser,
Expand All @@ -465,6 +442,55 @@ def _read_spreadsheet(
return next(iter(parsed_sheets.values()))


def _get_sheet_names(
sheet_id: int | Sequence[int] | None,
sheet_name: str | list[str] | tuple[str] | None,
worksheets: list[dict[str, Any]],
) -> tuple[list[str], bool]:
"""Establish sheets to read; indicate if we are returning a dict frames."""
if sheet_id is not None and sheet_name is not None:
raise ValueError(
f"cannot specify both `sheet_name` ({sheet_name!r}) and `sheet_id` ({sheet_id!r})"
)
sheet_names = []
if sheet_id is None and sheet_name is None:
sheet_names.append(worksheets[0]["name"])
return_multi = False
elif sheet_id == 0:
sheet_names.extend(ws["name"] for ws in worksheets)
return_multi = True
else:
return_multi = (
(isinstance(sheet_name, Sequence) and not isinstance(sheet_name, str))
or isinstance(sheet_id, Sequence)
or sheet_id == 0
)
if names := (
(sheet_name,) if isinstance(sheet_name, str) else sheet_name or ()
):
known_sheet_names = {ws["name"] for ws in worksheets}
for name in names:
if name not in known_sheet_names:
raise ValueError(
f"no matching sheet found when `sheet_name` is {name!r}"
)
sheet_names.append(name)
else:
ids = (sheet_id,) if isinstance(sheet_id, int) else sheet_id or ()
sheet_names_by_idx = {
idx: ws["name"]
for idx, ws in enumerate(worksheets, start=1)
if (sheet_id == 0 or ws["index"] in ids or ws["name"] in names)
}
for idx in ids:
if (name := sheet_names_by_idx.get(idx)) is None: # type: ignore[assignment]
raise ValueError(
f"no matching sheet found when `sheet_id` is {idx}"
)
sheet_names.append(name)
return sheet_names, return_multi


def _initialise_spreadsheet_parser(
engine: Literal["xlsx2csv", "openpyxl", "pyxlsb", "ods"],
source: str | BytesIO | Path | BinaryIO | bytes,
Expand Down
8 changes: 4 additions & 4 deletions py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,18 +100,18 @@ def test_read_excel_multi_sheets(
spreadsheet_path = request.getfixturevalue(source)
frames_by_id = read_spreadsheet(
spreadsheet_path,
sheet_id=[1, 2],
sheet_id=[2, 1],
sheet_name=None,
**params,
)
frames_by_name = read_spreadsheet(
spreadsheet_path,
sheet_id=None,
sheet_name=["test1", "test2"],
sheet_name=["test2", "test1"],
**params,
)
for frames in (frames_by_id, frames_by_name):
assert len(frames) == 2
assert list(frames_by_name) == ["test2", "test1"]

expected1 = pl.DataFrame({"hello": ["Row 1", "Row 2"]})
expected2 = pl.DataFrame({"world": ["Row 3", "Row 4"]})
Expand Down Expand Up @@ -218,7 +218,7 @@ def test_read_invalid_worksheet(
value = sheet_id if param == "id" else sheet_name
with pytest.raises(
ValueError,
match=f"no matching sheets found when `sheet_{param}` is {value!r}",
match=f"no matching sheet found when `sheet_{param}` is {value!r}",
):
read_spreadsheet(
spreadsheet_path, sheet_id=sheet_id, sheet_name=sheet_name, **params
Expand Down

0 comments on commit 9947241

Please sign in to comment.