Skip to content

Commit

Permalink
feat(python): Add 'drop_empty_rows' parameter for read_excel (#18253)
Browse files Browse the repository at this point in the history
Co-authored-by: Rashikraj Shrestha <[email protected]>
  • Loading branch information
Rashik-raj and Rashikraj Shrestha authored Oct 11, 2024
1 parent 251e171 commit fc970f7
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 8 deletions.
45 changes: 38 additions & 7 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> pl.DataFrame: ...


Expand All @@ -73,6 +74,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> pl.DataFrame: ...


Expand All @@ -90,6 +92,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> NoReturn: ...


Expand All @@ -109,6 +112,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> dict[str, pl.DataFrame]: ...


Expand All @@ -126,6 +130,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> pl.DataFrame: ...


Expand All @@ -143,6 +148,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> dict[str, pl.DataFrame]: ...


Expand All @@ -161,6 +167,7 @@ def read_excel(
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
raise_if_empty: bool = True,
drop_empty_rows: bool = True,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
"""
Read Excel spreadsheet data into a DataFrame.
Expand Down Expand Up @@ -232,6 +239,9 @@ def read_excel(
raise_if_empty
When there is no data in the sheet, `NoDataError` is raised. If this parameter
is set to False, an empty DataFrame (with no columns) is returned instead.
drop_empty_rows
Indicate whether to drop empty rows when reading data into the DataFrame.
Default is True.
Returns
-------
Expand Down Expand Up @@ -299,6 +309,7 @@ def read_excel(
raise_if_empty=raise_if_empty,
has_header=has_header,
columns=columns,
drop_empty_rows=drop_empty_rows,
)


Expand Down Expand Up @@ -530,6 +541,7 @@ def _read_spreadsheet(
columns: Sequence[int] | Sequence[str] | None = None,
has_header: bool = True,
raise_if_empty: bool = True,
drop_empty_rows: bool = True,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
if isinstance(source, (str, Path)):
source = normalize_filepath(source)
Expand Down Expand Up @@ -561,6 +573,7 @@ def _read_spreadsheet(
read_options=read_options,
raise_if_empty=raise_if_empty,
columns=columns,
drop_empty_rows=drop_empty_rows,
)
for name in sheet_names
}
Expand Down Expand Up @@ -749,6 +762,7 @@ def _csv_buffer_to_frame(
read_options: dict[str, Any],
schema_overrides: SchemaDict | None,
raise_if_empty: bool,
drop_empty_rows: bool,
) -> pl.DataFrame:
"""Translate StringIO buffer containing delimited data as a DataFrame."""
# handle (completely) empty sheet data
Expand Down Expand Up @@ -782,11 +796,19 @@ def _csv_buffer_to_frame(
separator=separator,
**read_options,
)
return _drop_null_data(df, raise_if_empty=raise_if_empty)
return _drop_null_data(
df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows
)


def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame:
"""If DataFrame contains columns/rows that contain only nulls, drop them."""
def _drop_null_data(
df: pl.DataFrame, *, raise_if_empty: bool, drop_empty_rows: bool = True
) -> pl.DataFrame:
"""
If DataFrame contains columns/rows that contain only nulls, drop them.
If `drop_empty_rows` is set to `False`, empty rows are not dropped.
"""
null_cols = []
for col_name in df.columns:
# note that if multiple unnamed columns are found then all but the first one
Expand All @@ -807,8 +829,9 @@ def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame:

if len(df) == 0 and len(df.columns) == 0:
return _empty_frame(raise_if_empty)

return df.filter(~F.all_horizontal(F.all().is_null()))
if drop_empty_rows:
return df.filter(~F.all_horizontal(F.all().is_null()))
return df


def _empty_frame(raise_if_empty: bool) -> pl.DataFrame: # noqa: FBT001
Expand Down Expand Up @@ -840,6 +863,7 @@ def _read_spreadsheet_openpyxl(
schema_overrides: SchemaDict | None,
columns: Sequence[int] | Sequence[str] | None,
raise_if_empty: bool,
drop_empty_rows: bool,
) -> pl.DataFrame:
"""Use the 'openpyxl' library to read data from the given worksheet."""
infer_schema_length = read_options.pop("infer_schema_length", None)
Expand Down Expand Up @@ -896,7 +920,9 @@ def _read_spreadsheet_openpyxl(
strict=False,
)

df = _drop_null_data(df, raise_if_empty=raise_if_empty)
df = _drop_null_data(
df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows
)
df = _reorder_columns(df, columns)
return df

Expand All @@ -909,6 +935,7 @@ def _read_spreadsheet_calamine(
schema_overrides: SchemaDict | None,
columns: Sequence[int] | Sequence[str] | None,
raise_if_empty: bool,
drop_empty_rows: bool,
) -> pl.DataFrame:
# if we have 'schema_overrides' and a more recent version of `fastexcel`
# we can pass translated dtypes to the engine to refine the initial parse
Expand Down Expand Up @@ -966,7 +993,9 @@ def _read_spreadsheet_calamine(
if schema_overrides:
df = df.cast(dtypes=schema_overrides)

df = _drop_null_data(df, raise_if_empty=raise_if_empty)
df = _drop_null_data(
df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows
)

# standardise on string dtype for null columns in empty frame
if df.is_empty():
Expand Down Expand Up @@ -1009,6 +1038,7 @@ def _read_spreadsheet_xlsx2csv(
schema_overrides: SchemaDict | None,
columns: Sequence[int] | Sequence[str] | None,
raise_if_empty: bool,
drop_empty_rows: bool,
) -> pl.DataFrame:
"""Use the 'xlsx2csv' library to read data from the given worksheet."""
csv_buffer = StringIO()
Expand All @@ -1031,6 +1061,7 @@ def _read_spreadsheet_xlsx2csv(
read_options=read_options,
schema_overrides=schema_overrides,
raise_if_empty=raise_if_empty,
drop_empty_rows=drop_empty_rows,
)
if cast_to_boolean:
df = df.with_columns(*cast_to_boolean)
Expand Down
Binary file not shown.
42 changes: 41 additions & 1 deletion py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from polars._typing import ExcelSpreadsheetEngine, SelectorType

pytestmark = pytest.mark.slow()
# pytestmark = pytest.mark.slow()


@pytest.fixture
Expand Down Expand Up @@ -83,6 +83,11 @@ def path_ods_mixed(io_files_path: Path) -> Path:
return io_files_path / "mixed.ods"


@pytest.fixture
def path_empty_rows_excel(io_files_path: Path) -> Path:
    """Return the path of the `test_empty_rows.xlsx` workbook in the io files dir."""
    return io_files_path / "test_empty_rows.xlsx"


@pytest.mark.parametrize(
("read_spreadsheet", "source", "engine_params"),
[
Expand Down Expand Up @@ -1060,3 +1065,38 @@ def test_identify_workbook(
bytesio_data = BytesIO(f.read())
assert _identify_workbook(bytesio_data) == file_type
assert isinstance(pl.read_excel(bytesio_data, engine="calamine"), pl.DataFrame)


def test_drop_empty_rows(path_empty_rows_excel: Path) -> None:
    """
    Verify the `drop_empty_rows` option of `read_excel` for each engine.

    The expected shapes imply that the fixture workbook yields 10 rows by
    4 columns when nothing is dropped, and 8 rows once empty rows are removed.
    """
    engines: tuple[ExcelSpreadsheetEngine, ...] = ("xlsx2csv", "openpyxl", "calamine")
    shape_without_empty_rows = (8, 4)
    shape_with_empty_rows = (10, 4)

    for engine in engines:
        # omitting the parameter must behave the same as passing `True`
        df_default = pl.read_excel(source=path_empty_rows_excel, engine=engine)
        assert df_default.shape == shape_without_empty_rows

        df_dropped = pl.read_excel(
            source=path_empty_rows_excel, engine=engine, drop_empty_rows=True
        )
        assert df_dropped.shape == shape_without_empty_rows

        # opting out keeps the empty rows in the frame
        df_kept = pl.read_excel(
            source=path_empty_rows_excel, engine=engine, drop_empty_rows=False
        )
        assert df_kept.shape == shape_with_empty_rows

0 comments on commit fc970f7

Please sign in to comment.