From ea25c40bc84a58ebd7cec1bf0ebab83dc7491a36 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Fri, 4 Oct 2024 18:07:43 +0200 Subject: [PATCH 01/10] refactor: use polars parquet reader --- py-polars/polars/io/delta.py | 115 +++++++++++++++----------- py-polars/tests/unit/io/test_delta.py | 24 ------ 2 files changed, 68 insertions(+), 71 deletions(-) diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index 9cb621fc5534..8cc4a7b65fad 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -5,14 +5,15 @@ from typing import TYPE_CHECKING, Any from urllib.parse import urlparse -from polars.convert import from_arrow +from polars.dataframe import DataFrame from polars.datatypes import Null, Time from polars.datatypes.convert import unpack_dtypes from polars.dependencies import _DELTALAKE_AVAILABLE, deltalake -from polars.io.pyarrow_dataset import scan_pyarrow_dataset +from polars.io.parquet import scan_parquet +from polars.schema import Schema if TYPE_CHECKING: - from polars import DataFrame, DataType, LazyFrame + from polars import DataType, LazyFrame def read_delta( @@ -20,10 +21,8 @@ def read_delta( *, version: int | str | datetime | None = None, columns: list[str] | None = None, - rechunk: bool = False, storage_options: dict[str, Any] | None = None, delta_table_options: dict[str, Any] | None = None, - pyarrow_options: dict[str, Any] | None = None, ) -> DataFrame: """ Reads into a DataFrame from a Delta lake table. @@ -42,9 +41,6 @@ def read_delta( table is read. columns Columns to select. Accepts a list of column names. - rechunk - Make sure that all columns are contiguous in memory by - aggregating the chunks into a single array. storage_options Extra options for the storage backends supported by `deltalake`. For cloud storages, this may include configurations for authentication etc. @@ -53,8 +49,6 @@ def read_delta( `__. delta_table_options Additional keyword arguments while reading a Delta lake Table. - pyarrow_options - Keyword arguments while converting a Delta lake Table to pyarrow table. Returns ------- @@ -68,15 +62,6 @@ def read_delta( >>> table_path = "/path/to/delta-table/" >>> pl.read_delta(table_path) # doctest: +SKIP - Use the `pyarrow_options` parameter to read only certain partitions. - Note: This should be preferred over using an equivalent `.filter()` on the resulting - DataFrame, as this avoids reading the data at all. - - >>> pl.read_delta( # doctest: +SKIP - ... table_path, - ... pyarrow_options={"partitions": [("year", "=", "2021")]}, - ... ) - Reads a specific version of the Delta table from local filesystem. Note: This will fail if the provided version of the delta table does not exist. @@ -135,21 +120,15 @@ def read_delta( ... table_path, delta_table_options=delta_table_options ... 
) # doctest: +SKIP """ - if pyarrow_options is None: - pyarrow_options = {} - - resolved_uri = _resolve_delta_lake_uri(source) - - dl_tbl = _get_delta_lake_table( - table_path=resolved_uri, + df = scan_delta( + source=source, version=version, storage_options=storage_options, delta_table_options=delta_table_options, ) - - return from_arrow( - dl_tbl.to_pyarrow_table(columns=columns, **pyarrow_options), rechunk=rechunk - ) # type: ignore[return-value] + if columns is not None: + df = df.select(columns) + return df.collect() def scan_delta( @@ -158,7 +137,6 @@ def scan_delta( version: int | str | datetime | None = None, storage_options: dict[str, Any] | None = None, delta_table_options: dict[str, Any] | None = None, - pyarrow_options: dict[str, Any] | None = None, ) -> LazyFrame: """ Lazily read from a Delta lake table. @@ -183,10 +161,6 @@ def scan_delta( `__. delta_table_options Additional keyword arguments while reading a Delta lake Table. - pyarrow_options - Keyword arguments while converting a Delta lake Table to pyarrow table. - Use this parameter when filtering on partitioned columns or to read - from a 'fsspec' supported filesystem. Returns ------- @@ -200,13 +174,6 @@ def scan_delta( >>> table_path = "/path/to/delta-table/" >>> pl.scan_delta(table_path).collect() # doctest: +SKIP - Use the `pyarrow_options` parameter to read only certain partitions. - - >>> pl.scan_delta( # doctest: +SKIP - ... table_path, - ... pyarrow_options={"partitions": [("year", "=", "2021")]}, - ... ) - Creates a scan for a specific version of the Delta table from local filesystem. Note: This will fail if the provided version of the delta table does not exist. @@ -271,9 +238,6 @@ def scan_delta( ... table_path, delta_table_options=delta_table_options ... ).collect() # doctest: +SKIP """ - if pyarrow_options is None: - pyarrow_options = {} - resolved_uri = _resolve_delta_lake_uri(source) dl_tbl = _get_delta_lake_table( table_path=resolved_uri, @@ -281,9 +245,66 @@ def scan_delta( storage_options=storage_options, delta_table_options=delta_table_options, ) + import pyarrow as pa + from deltalake.exceptions import DeltaProtocolError + from deltalake.table import ( + MAX_SUPPORTED_READER_VERSION, + NOT_SUPPORTED_READER_VERSION, + SUPPORTED_READER_FEATURES, + ) - pa_ds = dl_tbl.to_pyarrow_dataset(**pyarrow_options) - return scan_pyarrow_dataset(pa_ds) + table_protocol = dl_tbl.protocol() + if ( + table_protocol.min_reader_version > MAX_SUPPORTED_READER_VERSION + or table_protocol.min_reader_version == NOT_SUPPORTED_READER_VERSION + ): + msg = ( + f"The table's minimum reader version is {table_protocol.min_reader_version} " + f"but deltalake only supports version 1 or {MAX_SUPPORTED_READER_VERSION} with these reader features: {SUPPORTED_READER_FEATURES}" + ) + raise DeltaProtocolError(msg) + if ( + table_protocol.min_reader_version >= 3 + and table_protocol.reader_features is not None + ): + missing_features = {*table_protocol.reader_features}.difference( + SUPPORTED_READER_FEATURES + ) + if len(missing_features) > 0: + msg = f"The table has set these reader features: {missing_features} but these are not yet supported by the deltalake reader." 
+ raise DeltaProtocolError(msg) + + delta_schema = dl_tbl.schema().to_pyarrow(as_large_types=True) + polars_schema = DataFrame(pa.Table.from_pylist([], delta_schema)).schema + partition_columns = dl_tbl.metadata().partition_columns + + def _split_schema( + schema: Schema, partition_columns: list[str] + ) -> tuple[Schema, Schema]: + if len(partition_columns) == 0: + return schema, Schema([]) + main_schema = [] + hive_schema = [] + + for name, dtype in schema.items(): + if name in partition_columns: + hive_schema.append((name, dtype)) + else: + main_schema.append((name, dtype)) + + return Schema(main_schema), Schema(hive_schema) + + # Required because main_schema cannot contain hive columns currently + main_schema, hive_schema = _split_schema(polars_schema, partition_columns) + + return scan_parquet( + dl_tbl.file_uris(), + schema=main_schema, + hive_schema=hive_schema, + allow_missing_columns=True, + hive_partitioning=len(partition_columns) > 0, + storage_options=storage_options, + ) def _resolve_delta_lake_uri(table_uri: str, *, strict: bool = True) -> str: diff --git a/py-polars/tests/unit/io/test_delta.py b/py-polars/tests/unit/io/test_delta.py index 33c6b052ffdf..99c80589b81a 100644 --- a/py-polars/tests/unit/io/test_delta.py +++ b/py-polars/tests/unit/io/test_delta.py @@ -69,18 +69,6 @@ def test_scan_delta_columns(delta_table_path: Path) -> None: assert_frame_equal(expected, ldf.collect(), check_dtypes=False) -def test_scan_delta_filesystem(delta_table_path: Path) -> None: - raw_filesystem = pyarrow.fs.LocalFileSystem() - fs = pyarrow.fs.SubTreeFileSystem(str(delta_table_path), raw_filesystem) - - ldf = pl.scan_delta( - str(delta_table_path), version=0, pyarrow_options={"filesystem": fs} - ) - - expected = pl.DataFrame({"name": ["Joey", "Ivan"], "age": [14, 32]}) - assert_frame_equal(expected, ldf.collect(), check_dtypes=False) - - def test_scan_delta_relative(delta_table_path: Path) -> None: rel_delta_table_path = str(delta_table_path / ".." / "delta-table") @@ -142,18 +130,6 @@ def test_read_delta_columns(delta_table_path: Path) -> None: assert_frame_equal(expected, df, check_dtypes=False) -def test_read_delta_filesystem(delta_table_path: Path) -> None: - raw_filesystem = pyarrow.fs.LocalFileSystem() - fs = pyarrow.fs.SubTreeFileSystem(str(delta_table_path), raw_filesystem) - - df = pl.read_delta( - str(delta_table_path), version=0, pyarrow_options={"filesystem": fs} - ) - - expected = pl.DataFrame({"name": ["Joey", "Ivan"], "age": [14, 32]}) - assert_frame_equal(expected, df, check_dtypes=False) - - def test_read_delta_relative(delta_table_path: Path) -> None: rel_delta_table_path = str(delta_table_path / ".." 
/ "delta-table") From 11723c22c1a282671f3c9f5774dfb164b13b5692 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Fri, 4 Oct 2024 18:12:23 +0200 Subject: [PATCH 02/10] chore: update msg --- py-polars/polars/io/delta.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index 8cc4a7b65fad..4b4cd982abb5 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -260,7 +260,7 @@ def scan_delta( ): msg = ( f"The table's minimum reader version is {table_protocol.min_reader_version} " - f"but deltalake only supports version 1 or {MAX_SUPPORTED_READER_VERSION} with these reader features: {SUPPORTED_READER_FEATURES}" + f"but polars delta scanner only supports version 1 or {MAX_SUPPORTED_READER_VERSION} with these reader features: {SUPPORTED_READER_FEATURES}" ) raise DeltaProtocolError(msg) if ( @@ -271,7 +271,7 @@ def scan_delta( SUPPORTED_READER_FEATURES ) if len(missing_features) > 0: - msg = f"The table has set these reader features: {missing_features} but these are not yet supported by the deltalake reader." + msg = f"The table has set these reader features: {missing_features} but these are not yet supported by the polars delta scanner." raise DeltaProtocolError(msg) delta_schema = dl_tbl.schema().to_pyarrow(as_large_types=True) From a051365819225eca345cfdc63261f8780c65e0f4 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Fri, 4 Oct 2024 18:15:35 +0200 Subject: [PATCH 03/10] chore: use from_arrow --- py-polars/polars/io/delta.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index 4b4cd982abb5..03f8b1953f72 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any from urllib.parse import urlparse -from polars.dataframe import DataFrame +from polars.convert import from_arrow from polars.datatypes import Null, Time from polars.datatypes.convert import unpack_dtypes from polars.dependencies import _DELTALAKE_AVAILABLE, deltalake @@ -13,7 +13,7 @@ from polars.schema import Schema if TYPE_CHECKING: - from polars import DataType, LazyFrame + from polars import DataFrame, DataType, LazyFrame def read_delta( @@ -275,7 +275,7 @@ def scan_delta( raise DeltaProtocolError(msg) delta_schema = dl_tbl.schema().to_pyarrow(as_large_types=True) - polars_schema = DataFrame(pa.Table.from_pylist([], delta_schema)).schema + polars_schema = from_arrow(pa.Table.from_pylist([], delta_schema)).schema # type: ignore partition_columns = dl_tbl.metadata().partition_columns def _split_schema( From 699c4ffce8f6e331aa46190fbaebe7fa7229ac6c Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Fri, 4 Oct 2024 18:18:22 +0200 Subject: [PATCH 04/10] chore: remove rechunk --- py-polars/tests/unit/io/test_delta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/tests/unit/io/test_delta.py b/py-polars/tests/unit/io/test_delta.py index 99c80589b81a..da4d58deaa05 100644 --- a/py-polars/tests/unit/io/test_delta.py +++ b/py-polars/tests/unit/io/test_delta.py @@ -490,5 +490,5 @@ def test_read_parquet_respects_rechunk_16982( df.write_delta(str(tmp_path), mode="append") rechunk, expected_chunks = rechunk_and_expected_chunks - result = pl.read_delta(str(tmp_path), rechunk=rechunk) + result = 
pl.read_delta(str(tmp_path)) assert result.n_chunks() == expected_chunks From 0103cdd8356cb7e4e0bc9865ac0b818a1fd321b8 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Sat, 5 Oct 2024 11:36:52 +0200 Subject: [PATCH 05/10] chore: keep deprecated pyarrow options behavior, issue warning --- py-polars/polars/io/delta.py | 56 +++++++++++++++++++++++++++ py-polars/tests/unit/io/test_delta.py | 23 +++-------- 2 files changed, 61 insertions(+), 18 deletions(-) diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index 03f8b1953f72..bd948e75623b 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -5,11 +5,13 @@ from typing import TYPE_CHECKING, Any from urllib.parse import urlparse +from polars._utils.deprecation import issue_deprecation_warning from polars.convert import from_arrow from polars.datatypes import Null, Time from polars.datatypes.convert import unpack_dtypes from polars.dependencies import _DELTALAKE_AVAILABLE, deltalake from polars.io.parquet import scan_parquet +from polars.io.pyarrow_dataset.functions import scan_pyarrow_dataset from polars.schema import Schema if TYPE_CHECKING: @@ -21,8 +23,10 @@ def read_delta( *, version: int | str | datetime | None = None, columns: list[str] | None = None, + rechunk: bool | None = None, storage_options: dict[str, Any] | None = None, delta_table_options: dict[str, Any] | None = None, + pyarrow_options: dict[str, Any] | None = None, ) -> DataFrame: """ Reads into a DataFrame from a Delta lake table. @@ -41,6 +45,12 @@ def read_delta( table is read. columns Columns to select. Accepts a list of column names. + rechunk + Make sure that all columns are contiguous in memory by + aggregating the chunks into a single array. + + .. deprecated:: 1.10.0 + Rechunk is automatically done in native reader storage_options Extra options for the storage backends supported by `deltalake`. For cloud storages, this may include configurations for authentication etc. @@ -49,6 +59,11 @@ def read_delta( `__. delta_table_options Additional keyword arguments while reading a Delta lake Table. + pyarrow_options + Keyword arguments while converting a Delta lake Table to pyarrow table. + + .. deprecated:: 1.10.0 + Remove pyarrow_options and use native polars filter, selection. Returns ------- @@ -120,6 +135,30 @@ def read_delta( ... table_path, delta_table_options=delta_table_options ... ) # doctest: +SKIP """ + if pyarrow_options is not None: + issue_deprecation_warning( + message="`pyarrow_options` are deprecated, polars native parquet reader is used when not passing pyarrow options.", + version="1.10", + ) + resolved_uri = _resolve_delta_lake_uri(source) + + dl_tbl = _get_delta_lake_table( + table_path=resolved_uri, + version=version, + storage_options=storage_options, + delta_table_options=delta_table_options, + ) + if rechunk is None: + rechunk = False + return from_arrow( + dl_tbl.to_pyarrow_table(columns=columns, **pyarrow_options), rechunk=rechunk + ) # type: ignore[return-value] + + if rechunk is not None: + issue_deprecation_warning( + message="`rechunk` is deprecated, this is automatically done now.", + version="1.10", + ) df = scan_delta( source=source, version=version, @@ -137,6 +176,7 @@ def scan_delta( version: int | str | datetime | None = None, storage_options: dict[str, Any] | None = None, delta_table_options: dict[str, Any] | None = None, + pyarrow_options: dict[str, Any] | None = None, ) -> LazyFrame: """ Lazily read from a Delta lake table. 
@@ -161,6 +201,13 @@ def scan_delta( `__. delta_table_options Additional keyword arguments while reading a Delta lake Table. + pyarrow_options + Keyword arguments while converting a Delta lake Table to pyarrow table. + Use this parameter when filtering on partitioned columns or to read + from a 'fsspec' supported filesystem. + + .. deprecated:: 1.10.0 + Remove pyarrow_options and use native polars filter, selection. Returns ------- @@ -245,6 +292,15 @@ def scan_delta( storage_options=storage_options, delta_table_options=delta_table_options, ) + + if pyarrow_options is not None: + issue_deprecation_warning( + message="PyArrow options are deprecated, polars native parquet scanner is used when not passing pyarrow options.", + version="1.10", + ) + pa_ds = dl_tbl.to_pyarrow_dataset(**pyarrow_options) + return scan_pyarrow_dataset(pa_ds) + import pyarrow as pa from deltalake.exceptions import DeltaProtocolError from deltalake.table import ( diff --git a/py-polars/tests/unit/io/test_delta.py b/py-polars/tests/unit/io/test_delta.py index da4d58deaa05..0276eec34525 100644 --- a/py-polars/tests/unit/io/test_delta.py +++ b/py-polars/tests/unit/io/test_delta.py @@ -184,7 +184,7 @@ def test_write_delta(df: pl.DataFrame, tmp_path: Path) -> None: assert v1.columns == pl_df_1.columns assert df_supported.shape == pl_df_partitioned.shape - assert df_supported.columns == pl_df_partitioned.columns + assert sorted(df_supported.columns) == sorted(pl_df_partitioned.columns) assert tbl.version() == 1 assert partitioned_tbl.version() == 0 @@ -216,7 +216,7 @@ def test_write_delta(df: pl.DataFrame, tmp_path: Path) -> None: assert partitioned_tbl.version() == 1 assert pl_df_partitioned.shape == (6, 14) # Rows are doubled - assert df_supported.columns == pl_df_partitioned.columns + assert sorted(df_supported.columns) == sorted(pl_df_partitioned.columns) df_supported.write_delta(partitioned_tbl_uri, mode="overwrite") @@ -470,25 +470,12 @@ def test_unsupported_dtypes(tmp_path: Path) -> None: df.write_delta(tmp_path / "time") +@pytest.mark.skip( + reason="upstream bug in delta-rs causing categorical to be written as categorical in parquet" +) @pytest.mark.write_disk def test_categorical_becomes_string(tmp_path: Path) -> None: df = pl.DataFrame({"a": ["A", "B", "A"]}, schema={"a": pl.Categorical}) df.write_delta(tmp_path) df2 = pl.read_delta(str(tmp_path)) assert_frame_equal(df2, pl.DataFrame({"a": ["A", "B", "A"]}, schema={"a": pl.Utf8})) - - -@pytest.mark.write_disk -@pytest.mark.parametrize("rechunk_and_expected_chunks", [(True, 1), (False, 3)]) -def test_read_parquet_respects_rechunk_16982( - rechunk_and_expected_chunks: tuple[bool, int], tmp_path: Path -) -> None: - # Create a delta lake table with 3 chunks: - df = pl.DataFrame({"a": [1]}) - df.write_delta(str(tmp_path)) - df.write_delta(str(tmp_path), mode="append") - df.write_delta(str(tmp_path), mode="append") - - rechunk, expected_chunks = rechunk_and_expected_chunks - result = pl.read_delta(str(tmp_path)) - assert result.n_chunks() == expected_chunks From e3ef45a70fa1c87ca7c2faeaf54cf7d33730f03d Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Sat, 5 Oct 2024 11:41:50 +0200 Subject: [PATCH 06/10] chore: ignore --- py-polars/polars/io/delta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index bd948e75623b..33b6f3140696 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -331,7 +331,7 @@ def scan_delta( 
raise DeltaProtocolError(msg) delta_schema = dl_tbl.schema().to_pyarrow(as_large_types=True) - polars_schema = from_arrow(pa.Table.from_pylist([], delta_schema)).schema # type: ignore + polars_schema = from_arrow(pa.Table.from_pylist([], delta_schema)).schema # type: ignore[union-attr] partition_columns = dl_tbl.metadata().partition_columns def _split_schema( From 3aa60c869d627dbc6e59b642836d69eed2f9ced9 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Sat, 2 Nov 2024 11:08:27 +0100 Subject: [PATCH 07/10] chore: fmt --- py-polars/tests/unit/io/test_delta.py | 1 + 1 file changed, 1 insertion(+) diff --git a/py-polars/tests/unit/io/test_delta.py b/py-polars/tests/unit/io/test_delta.py index aeb349a054d8..d142c316c7d1 100644 --- a/py-polars/tests/unit/io/test_delta.py +++ b/py-polars/tests/unit/io/test_delta.py @@ -480,6 +480,7 @@ def test_categorical_becomes_string(tmp_path: Path) -> None: df2 = pl.read_delta(str(tmp_path)) assert_frame_equal(df2, pl.DataFrame({"a": ["A", "B", "A"]}, schema={"a": pl.Utf8})) + def test_scan_delta_DT_input(delta_table_path: Path) -> None: DT = DeltaTable(str(delta_table_path), version=0) ldf = pl.scan_delta(DT) From b2c4276222e9bccf34ae829962b8e16f4185a971 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Wed, 13 Nov 2024 19:57:44 +0100 Subject: [PATCH 08/10] chore: add use_pyarrow flag --- py-polars/polars/io/delta.py | 58 ++++++++++++++---------------------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index 40212d160a48..a77e1175e09b 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any from urllib.parse import urlparse -from polars._utils.deprecation import issue_deprecation_warning from polars.convert import from_arrow from polars.datatypes import Null, Time from polars.datatypes.convert import unpack_dtypes @@ -26,9 +25,10 @@ def read_delta( *, version: int | str | datetime | None = None, columns: list[str] | None = None, - rechunk: bool | None = None, + rechunk: bool = False, storage_options: dict[str, Any] | None = None, delta_table_options: dict[str, Any] | None = None, + use_pyarrow: bool = False, pyarrow_options: dict[str, Any] | None = None, ) -> DataFrame: """ @@ -62,12 +62,11 @@ def read_delta( `__. delta_table_options Additional keyword arguments while reading a Delta lake Table. + use_pyarrow + Flag to enable pyarrow dataset reads. pyarrow_options Keyword arguments while converting a Delta lake Table to pyarrow table. - .. deprecated:: 1.10.0 - Remove pyarrow_options and use native polars filter, selection. - Returns ------- DataFrame @@ -138,33 +137,14 @@ def read_delta( ... table_path, delta_table_options=delta_table_options ... 
) # doctest: +SKIP """ - if pyarrow_options is not None: - issue_deprecation_warning( - message="`pyarrow_options` are deprecated, polars native parquet reader is used when not passing pyarrow options.", - version="1.13", - ) - dl_tbl = _get_delta_lake_table( - table_path=source, - version=version, - storage_options=storage_options, - delta_table_options=delta_table_options, - ) - if rechunk is None: - rechunk = False - return from_arrow( - dl_tbl.to_pyarrow_table(columns=columns, **pyarrow_options), rechunk=rechunk - ) # type: ignore[return-value] - - if rechunk is not None: - issue_deprecation_warning( - message="`rechunk` is deprecated, this is automatically done now.", - version="1.13", - ) df = scan_delta( source=source, version=version, storage_options=storage_options, delta_table_options=delta_table_options, + use_pyarrow=use_pyarrow, + pyarrow_options=pyarrow_options, + rechunk=rechunk, ) if columns is not None: @@ -178,7 +158,9 @@ def scan_delta( version: int | str | datetime | None = None, storage_options: dict[str, Any] | None = None, delta_table_options: dict[str, Any] | None = None, + use_pyarrow: bool = False, pyarrow_options: dict[str, Any] | None = None, + rechunk: bool = False, ) -> LazyFrame: """ Lazily read from a Delta lake table. @@ -203,13 +185,15 @@ def scan_delta( `__. delta_table_options Additional keyword arguments while reading a Delta lake Table. + use_pyarrow + Flag to enable pyarrow dataset reads. pyarrow_options Keyword arguments while converting a Delta lake Table to pyarrow table. Use this parameter when filtering on partitioned columns or to read from a 'fsspec' supported filesystem. - - .. deprecated:: 1.10.0 - Remove pyarrow_options and use native polars filter, selection. + rechunk + Make sure that all columns are contiguous in memory by + aggregating the chunks into a single array. Returns ------- @@ -294,14 +278,15 @@ def scan_delta( delta_table_options=delta_table_options, ) - if pyarrow_options is not None: - issue_deprecation_warning( - message="PyArrow options are deprecated, polars native parquet scanner is used when not passing pyarrow options.", - version="1.13", - ) + if use_pyarrow: + pyarrow_options = pyarrow_options or {} pa_ds = dl_tbl.to_pyarrow_dataset(**pyarrow_options) return scan_pyarrow_dataset(pa_ds) + if pyarrow_options is not None: + msg = "To make use of pyarrow_options, set use_pyarrow to True" + raise ValueError(msg) + import pyarrow as pa from deltalake.exceptions import DeltaProtocolError from deltalake.table import ( @@ -331,6 +316,8 @@ def scan_delta( msg = f"The table has set these reader features: {missing_features} but these are not yet supported by the polars delta scanner." 
raise DeltaProtocolError(msg) + # Requires conversion through pyarrow table because there is no direct way yet to + # convert a delta schema into a polars schema delta_schema = dl_tbl.schema().to_pyarrow(as_large_types=True) polars_schema = from_arrow(pa.Table.from_pylist([], delta_schema)).schema # type: ignore[union-attr] partition_columns = dl_tbl.metadata().partition_columns @@ -361,6 +348,7 @@ def _split_schema( allow_missing_columns=True, hive_partitioning=len(partition_columns) > 0, storage_options=storage_options, + rechunk=rechunk, ) From 2ef113fd675ee55530a88f6fcb027ca80df808a8 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Wed, 13 Nov 2024 21:30:53 +0100 Subject: [PATCH 09/10] chore: typehint --- py-polars/polars/io/delta.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index a77e1175e09b..635989fb8714 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -25,7 +25,7 @@ def read_delta( *, version: int | str | datetime | None = None, columns: list[str] | None = None, - rechunk: bool = False, + rechunk: bool | None = None, storage_options: dict[str, Any] | None = None, delta_table_options: dict[str, Any] | None = None, use_pyarrow: bool = False, @@ -160,7 +160,7 @@ def scan_delta( delta_table_options: dict[str, Any] | None = None, use_pyarrow: bool = False, pyarrow_options: dict[str, Any] | None = None, - rechunk: bool = False, + rechunk: bool | None = None, ) -> LazyFrame: """ Lazily read from a Delta lake table. @@ -348,7 +348,7 @@ def _split_schema( allow_missing_columns=True, hive_partitioning=len(partition_columns) > 0, storage_options=storage_options, - rechunk=rechunk, + rechunk=rechunk or False, ) From 6cdc5e79ed993974e8fa834c2d6b0c4ab74f3b44 Mon Sep 17 00:00:00 2001 From: ritchie Date: Sat, 16 Nov 2024 18:20:52 +0100 Subject: [PATCH 10/10] remove deprecation hint --- py-polars/polars/io/delta.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/py-polars/polars/io/delta.py b/py-polars/polars/io/delta.py index 635989fb8714..6e9ebc7c7433 100644 --- a/py-polars/polars/io/delta.py +++ b/py-polars/polars/io/delta.py @@ -51,9 +51,6 @@ def read_delta( rechunk Make sure that all columns are contiguous in memory by aggregating the chunks into a single array. - - .. deprecated:: 1.10.0 - Rechunk is automatically done in native reader storage_options Extra options for the storage backends supported by `deltalake`. For cloud storages, this may include configurations for authentication etc.
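
A usage sketch of the API as it stands after this series, in the same doctest style the module's docstrings use. The table path and the `year` partition filter are illustrative, and both calls assume a table whose reader protocol passes the version/feature checks added in patch 01:

>>> import polars as pl
>>> table_path = "/path/to/delta-table/"
>>> # Default path: the table's file URIs are handed to the native parquet
>>> # scanner, with partition columns resolved via hive_partitioning.
>>> pl.scan_delta(table_path).collect()  # doctest: +SKIP
>>> # Opt-in pyarrow path: `pyarrow_options` only take effect together with
>>> # `use_pyarrow=True`; per the guard in patch 08, passing them without
>>> # the flag raises a ValueError.
>>> pl.scan_delta(
...     table_path,
...     use_pyarrow=True,
...     pyarrow_options={"partitions": [("year", "=", "2021")]},
... ).collect()  # doctest: +SKIP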