Skip to content

Commit

Permalink
docs(python): Add links to read_excel "engine_options" and "read_options" docstring
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Jan 10, 2025
1 parent 17556e4 commit b6d6689
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 51 deletions.
2 changes: 1 addition & 1 deletion py-polars/polars/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
BufferInfo: TypeAlias = tuple[int, int, int]

# type alias for supported spreadsheet engines
ExcelSpreadsheetEngine: TypeAlias = Literal["xlsx2csv", "openpyxl", "calamine"]
ExcelSpreadsheetEngine: TypeAlias = Literal["calamine", "openpyxl", "xlsx2csv"]


class SeriesBuffers(TypedDict):
Expand Down
30 changes: 14 additions & 16 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,14 +255,12 @@ def read_excel(
"""
Read Excel spreadsheet data into a DataFrame.
.. versionadded:: 1.20
Support loading data from named table objects with `table_name` parameter.
.. versionadded:: 1.18
Support loading data from a list (or glob pattern) of multiple workbooks.
.. versionchanged:: 1.0
Default engine is now "calamine" (was "xlsx2csv").
.. versionadded:: 0.20.6
Added "calamine" fastexcel engine for Excel Workbooks (.xlsx, .xlsb, .xls).
.. versionadded:: 0.19.3
Added "openpyxl" engine, and added `schema_overrides` parameter.
Parameters
----------
Expand All @@ -283,34 +281,34 @@ def read_excel(
the workbook, so additionally specifying a sheet id or name is optional;
if one of those parameters *is* specified, an error will be raised if
the named table is not found in that particular sheet.
engine : {'calamine', 'xlsx2csv', 'openpyxl'}
engine : {'calamine', 'openpyxl', 'xlsx2csv'}
Library used to parse the spreadsheet file; defaults to "calamine".
* "calamine": this engine can be used for reading all major types of Excel
Workbook (`.xlsx`, `.xlsb`, `.xls`) and is *dramatically* faster than the
other options, using the `fastexcel` module to bind the Calamine parser.
other options, using the `fastexcel` module to bind the Rust Calamine parser.
* "openpyxl": this engine is significantly slower than both `calamine` and
`xlsx2csv`, but can provide a useful fallback if you are otherwise unable
to read data from your workbook.
* "xlsx2csv": converts the data to an in-memory CSV before using the native
polars `read_csv` method to parse the result. You can pass `engine_options`
and `read_options` to refine the conversion.
* "openpyxl": this engine is significantly slower than `xlsx2csv` but supports
additional automatic type inference; potentially useful if you are otherwise
unable to parse your sheet with the `xlsx2csv` engine in conjunction with the
`schema_overrides` parameter.
polars `read_csv` method to parse the result.
engine_options
Additional options passed to the underlying engine's primary parsing
constructor (given below), if supported:
* "calamine": n/a (can only provide `read_options`)
* "xlsx2csv": `Xlsx2csv`
* "openpyxl": `load_workbook`
* "openpyxl": `load_workbook <https://openpyxl.readthedocs.io/en/stable/api/openpyxl.reader.excel.html#openpyxl.reader.excel.load_workbook>`_
* "xlsx2csv": `Xlsx2csv <https://github.com/dilshod/xlsx2csv/blob/f35734aa453d65102198a77e7b8cd04928e6b3a2/xlsx2csv.py#L157>`_
read_options
Options passed to the underlying engine method that reads the sheet data.
Where supported, this allows for additional control over parsing. The
specific read methods associated with each engine are:
* "calamine": `ExcelReader.load_sheet_by_name`
* "xlsx2csv": `pl.read_csv`
* "calamine": `load_sheet_by_name <https://fastexcel.toucantoco.dev/fastexcel.html#ExcelReader.load_sheet_by_name>`_
(or `load_table <https://fastexcel.toucantoco.dev/fastexcel.html#ExcelReader.load_table>`_
if using the `table_name` parameter).
* "openpyxl": n/a (can only provide `engine_options`)
* "xlsx2csv": see :meth:`read_csv`
has_header
Indicate if the first row of the table data is a header or not. If False,
column names will be autogenerated in the following format: `column_x`, with
Expand Down
58 changes: 24 additions & 34 deletions py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def test_read_excel_all_sheets(

@pytest.mark.parametrize(
"engine",
["xlsx2csv", "calamine", "openpyxl"],
["calamine", "openpyxl", "xlsx2csv"],
)
def test_read_excel_basic_datatypes(engine: ExcelSpreadsheetEngine) -> None:
df = pl.DataFrame(
Expand Down Expand Up @@ -471,7 +471,7 @@ def test_read_mixed_dtype_columns(
)


@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_write_excel_bytes(engine: ExcelSpreadsheetEngine) -> None:
df = pl.DataFrame({"colx": [1.5, -2, 0], "coly": ["a", None, "c"]})

Expand Down Expand Up @@ -634,7 +634,7 @@ def test_unsupported_binary_workbook(path_xlsb: Path) -> None:
pl.read_excel(path_xlsb, engine="openpyxl")


@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_read_excel_all_sheets_with_sheet_name(path_xlsx: Path, engine: str) -> None:
with pytest.raises(
ValueError,
Expand Down Expand Up @@ -793,7 +793,7 @@ def test_excel_round_trip(write_params: dict[str, Any]) -> None:
assert_frame_equal(df, xldf)


@pytest.mark.parametrize("engine", ["xlsx2csv", "calamine"])
@pytest.mark.parametrize("engine", ["calamine", "xlsx2csv"])
def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> None:
df = pl.DataFrame(
{
Expand Down Expand Up @@ -828,7 +828,7 @@ def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> No
assert xldf.row(-1) == (None, 0.0, 0.0, 0, 0, None, 0.0, 0)


@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_excel_write_compound_types(engine: ExcelSpreadsheetEngine) -> None:
df = pl.DataFrame(
{"x": [[1, 2], [3, 4], [5, 6]], "y": ["a", "b", "c"], "z": [9, 8, 7]}
Expand Down Expand Up @@ -925,7 +925,7 @@ def test_excel_write_to_file_object(
assert_frame_equal(df, pl.read_excel(src, engine=engine))


@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_excel_read_no_headers(engine: ExcelSpreadsheetEngine) -> None:
df = pl.DataFrame(
{"colx": [1, 2, 3], "coly": ["aaa", "bbb", "ccc"], "colz": [0.5, 0.0, -1.0]}
Expand All @@ -938,7 +938,7 @@ def test_excel_read_no_headers(engine: ExcelSpreadsheetEngine) -> None:
assert_frame_equal(df, expected)


@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_excel_write_sparklines(engine: ExcelSpreadsheetEngine) -> None:
from xlsxwriter import Workbook

Expand Down Expand Up @@ -1217,7 +1217,7 @@ def test_excel_mixed_calamine_float_data(io_files_path: Path) -> None:
)


@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
@pytest.mark.may_fail_auto_streaming # read->scan_csv dispatch, _read_spreadsheet_xlsx2csv needs to be changed not to call `_reorder_columns` on the df
def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None:
df = pl.DataFrame(
Expand Down Expand Up @@ -1255,36 +1255,26 @@ def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None
assert_frame_equal(df.select(reversed_cols), read_df)


def test_drop_empty_rows(path_empty_rows_excel: Path) -> None:
df1 = pl.read_excel(source=path_empty_rows_excel, engine="xlsx2csv")
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_drop_empty_rows(
path_empty_rows_excel: Path, engine: ExcelSpreadsheetEngine
) -> None:
df1 = pl.read_excel(
source=path_empty_rows_excel,
engine=engine,
) # check default
assert df1.shape == (8, 4)

df2 = pl.read_excel(
source=path_empty_rows_excel, engine="xlsx2csv", drop_empty_rows=True
source=path_empty_rows_excel,
engine=engine,
drop_empty_rows=True,
)
assert df2.shape == (8, 4)

df3 = pl.read_excel(
source=path_empty_rows_excel, engine="xlsx2csv", drop_empty_rows=False
source=path_empty_rows_excel,
engine=engine,
drop_empty_rows=False,
)
assert df3.shape == (10, 4)

df4 = pl.read_excel(source=path_empty_rows_excel, engine="openpyxl")
assert df4.shape == (8, 4)
df5 = pl.read_excel(
source=path_empty_rows_excel, engine="openpyxl", drop_empty_rows=True
)
assert df5.shape == (8, 4)
df6 = pl.read_excel(
source=path_empty_rows_excel, engine="openpyxl", drop_empty_rows=False
)
assert df6.shape == (10, 4)

df7 = pl.read_excel(source=path_empty_rows_excel, engine="calamine")
assert df7.shape == (8, 4)
df8 = pl.read_excel(
source=path_empty_rows_excel, engine="calamine", drop_empty_rows=True
)
assert df8.shape == (8, 4)
df9 = pl.read_excel(
source=path_empty_rows_excel, engine="calamine", drop_empty_rows=False
)
assert df9.shape == (10, 4)

0 comments on commit b6d6689

Please sign in to comment.