From 8667622df55e6807cffb13f56dd293c0db4ee280 Mon Sep 17 00:00:00 2001 From: Max Piskunov Date: Mon, 13 Jan 2025 23:47:49 +0000 Subject: [PATCH] feat(python, rust): add statistics_enabled to ColumnProperties Signed-off-by: Max Piskunov --- python/deltalake/table.py | 8 +++++++- python/src/lib.rs | 20 +++++++++++++++++++- python/tests/test_writerproperties.py | 2 ++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/python/deltalake/table.py b/python/deltalake/table.py index d4e4dd192e..caafd2eb21 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -217,6 +217,7 @@ class ColumnProperties: def __init__( self, dictionary_enabled: Optional[bool] = None, + statistics_enabled: Optional[Literal["NONE", "CHUNK", "PAGE"]] = None, max_statistics_size: Optional[int] = None, bloom_filter_properties: Optional[BloomFilterProperties] = None, ): @@ -224,15 +225,20 @@ def __init__( Args: dictionary_enabled: Enable dictionary encoding for the column. + statistics_enabled: Statistics level for the column. max_statistics_size: Maximum size of statistics for the column. bloom_filter_properties: Bloom Filter Properties for the column. """ self.dictionary_enabled = dictionary_enabled + self.statistics_enabled = statistics_enabled self.max_statistics_size = max_statistics_size self.bloom_filter_properties = bloom_filter_properties def __str__(self) -> str: - return f"dictionary_enabled: {self.dictionary_enabled}, max_statistics_size: {self.max_statistics_size}, bloom_filter_properties: {self.bloom_filter_properties}" + return ( + f"dictionary_enabled: {self.dictionary_enabled}, statistics_enabled: {self.statistics_enabled}, " + f"max_statistics_size: {self.max_statistics_size}, bloom_filter_properties: {self.bloom_filter_properties}" + ) @dataclass(init=True) diff --git a/python/src/lib.rs b/python/src/lib.rs index 8ea08158e8..a4551bf641 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -52,7 +52,7 @@ use deltalake::operations::vacuum::VacuumBuilder; use deltalake::operations::{collect_sendable_stream, CustomExecuteHandler}; use deltalake::parquet::basic::Compression; use deltalake::parquet::errors::ParquetError; -use deltalake::parquet::file::properties::WriterProperties; +use deltalake::parquet::file::properties::{EnabledStatistics, WriterProperties}; use deltalake::partitions::PartitionFilter; use deltalake::protocol::{DeltaOperation, SaveMode}; use deltalake::storage::{IORuntime, ObjectStoreRef}; @@ -1566,6 +1566,13 @@ fn set_writer_properties(writer_properties: PyWriterProperties) -> DeltaResult DeltaResult, + pub statistics_enabled: Option, pub max_statistics_size: Option, pub bloom_filter_properties: Option, } diff --git a/python/tests/test_writerproperties.py b/python/tests/test_writerproperties.py index 30c25548ad..a9a8db5868 100644 --- a/python/tests/test_writerproperties.py +++ b/python/tests/test_writerproperties.py @@ -28,6 +28,7 @@ def test_writer_properties_all_filled(): column_properties={ "a": ColumnProperties( dictionary_enabled=True, + statistics_enabled="CHUNK", max_statistics_size=40, bloom_filter_properties=BloomFilterProperties( set_bloom_filter_enabled=True, fpp=0.2, ndv=30 @@ -35,6 +36,7 @@ def test_writer_properties_all_filled(): ), "b": ColumnProperties( dictionary_enabled=True, + statistics_enabled="PAGE", max_statistics_size=400, bloom_filter_properties=BloomFilterProperties( set_bloom_filter_enabled=False, fpp=0.2, ndv=30