diff --git a/py-polars/docs/source/reference/io.rst b/py-polars/docs/source/reference/io.rst
index 474e7576a652..d3c45469f94a 100644
--- a/py-polars/docs/source/reference/io.rst
+++ b/py-polars/docs/source/reference/io.rst
@@ -45,8 +45,8 @@ Delta Lake
 .. autosummary::
    :toctree: api/
 
-   scan_delta
    read_delta
+   scan_delta
    DataFrame.write_delta
 
 Excel / ODS
@@ -64,9 +64,9 @@ Feather / IPC
    :toctree: api/
 
    read_ipc
+   read_ipc_schema
    read_ipc_stream
    scan_ipc
-   read_ipc_schema
    DataFrame.write_ipc
    DataFrame.write_ipc_stream
    LazyFrame.sink_ipc
@@ -96,8 +96,8 @@ Parquet
    :toctree: api/
 
    read_parquet
-   scan_parquet
    read_parquet_schema
+   scan_parquet
    DataFrame.write_parquet
    LazyFrame.sink_parquet
 
diff --git a/py-polars/polars/io/__init__.py b/py-polars/polars/io/__init__.py
index f4a39b5f778a..395f15bd4c94 100644
--- a/py-polars/polars/io/__init__.py
+++ b/py-polars/polars/io/__init__.py
@@ -21,8 +21,8 @@
     "read_delta",
     "read_excel",
     "read_ipc",
-    "read_ipc_stream",
     "read_ipc_schema",
+    "read_ipc_stream",
     "read_json",
     "read_ndjson",
     "read_ods",
diff --git a/py-polars/polars/io/csv/__init__.py b/py-polars/polars/io/csv/__init__.py
index b18232f10346..cf5a2646240d 100644
--- a/py-polars/polars/io/csv/__init__.py
+++ b/py-polars/polars/io/csv/__init__.py
@@ -1,6 +1,8 @@
+from polars.io.csv.batched_reader import BatchedCsvReader
 from polars.io.csv.functions import read_csv, read_csv_batched, scan_csv
 
 __all__ = [
+    "BatchedCsvReader",
     "read_csv",
     "read_csv_batched",
     "scan_csv",
diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py
index 201b578be964..101672f7a5e9 100644
--- a/py-polars/polars/io/csv/batched_reader.py
+++ b/py-polars/polars/io/csv/batched_reader.py
@@ -110,14 +110,12 @@ def next_batches(self, n: int) -> list[DataFrame] | None:
         """
         Read `n` batches from the reader.
 
-        The `n` chunks will be parallelized over the
-        available threads.
+        These batches will be parallelized over the available threads.
 
         Parameters
         ----------
         n
-            Number of chunks to fetch.
-            This is ideally >= number of threads
+            Number of chunks to fetch; ideally this is >= number of threads.
 
         Examples
         --------
diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py
index a940e26f3697..6809f30e7978 100644
--- a/py-polars/tests/unit/io/test_csv.py
+++ b/py-polars/tests/unit/io/test_csv.py
@@ -16,6 +16,7 @@
 import polars as pl
 from polars._utils.various import normalize_filepath
 from polars.exceptions import ComputeError, NoDataError
+from polars.io.csv import BatchedCsvReader
 from polars.testing import assert_frame_equal, assert_series_equal
 
 if TYPE_CHECKING:
@@ -1414,8 +1415,9 @@ def test_csv_categorical_categorical_merge() -> None:
 
 def test_batched_csv_reader(foods_file_path: Path) -> None:
     reader = pl.read_csv_batched(foods_file_path, batch_size=4)
-    batches = reader.next_batches(5)
+    assert isinstance(reader, BatchedCsvReader)
 
+    batches = reader.next_batches(5)
     assert batches is not None
     assert len(batches) == 5
     assert batches[0].to_dict(as_series=False) == {
@@ -1431,10 +1433,12 @@ def test_batched_csv_reader(foods_file_path: Path) -> None:
         "sugars_g": [25, 0, 5, 11],
     }
     assert_frame_equal(pl.concat(batches), pl.read_csv(foods_file_path))
 
+    # the final batch of the low-memory variant is different
    reader = pl.read_csv_batched(foods_file_path, batch_size=4, low_memory=True)
     batches = reader.next_batches(5)
 
     assert len(batches) == 5  # type: ignore[arg-type]
 
+    batches += reader.next_batches(5)  # type: ignore[operator]
     assert_frame_equal(pl.concat(batches), pl.read_csv(foods_file_path))
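
With `BatchedCsvReader` now exported from `polars.io.csv`, the reader returned by `pl.read_csv_batched` can be referenced in type annotations without importing from a private module. A minimal sketch of draining such a reader, following the pattern in the test above — the `read_in_batches` helper and the `data.csv` path are illustrative, not part of this diff:

```python
import polars as pl
from polars.io.csv import BatchedCsvReader


def read_in_batches(path: str, n: int = 5) -> pl.DataFrame:
    # read_csv_batched returns a BatchedCsvReader; the public export
    # makes this annotation possible without private imports
    reader: BatchedCsvReader = pl.read_csv_batched(path, batch_size=4)
    frames: list[pl.DataFrame] = []
    # next_batches returns None once the input is exhausted; per the
    # updated docstring, n is ideally >= the number of available threads
    while (batches := reader.next_batches(n)) is not None:
        frames.extend(batches)
    return pl.concat(frames)


df = read_in_batches("data.csv")  # hypothetical input file
```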