Skip to content

Commit

Permalink
feat(python, rust): add statistics_enabled to ColumnProperties
Browse files Browse the repository at this point in the history
Signed-off-by: Max Piskunov <[email protected]>
  • Loading branch information
maxitg authored and rtyler committed Jan 15, 2025
1 parent 797888f commit 8667622
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 2 deletions.
8 changes: 7 additions & 1 deletion python/deltalake/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,22 +217,28 @@ class ColumnProperties:
def __init__(
self,
dictionary_enabled: Optional[bool] = None,
statistics_enabled: Optional[Literal["NONE", "CHUNK", "PAGE"]] = None,
max_statistics_size: Optional[int] = None,
bloom_filter_properties: Optional[BloomFilterProperties] = None,
):
"""Create a Column Properties instance for the Rust parquet writer.

Every argument is optional and is stored unchanged on the instance
under an attribute of the same name; an argument left as None is
stored as None.

Args:
dictionary_enabled: Enable dictionary encoding for the column.
statistics_enabled: Statistics level for the column; one of
`"NONE"`, `"CHUNK"` or `"PAGE"`.
max_statistics_size: Maximum size of statistics for the column.
bloom_filter_properties: Bloom Filter Properties for the column.
"""
self.dictionary_enabled = dictionary_enabled
self.statistics_enabled = statistics_enabled
self.max_statistics_size = max_statistics_size
self.bloom_filter_properties = bloom_filter_properties

def __str__(self) -> str:
"""Return a single-line, comma-separated `name: value` summary of this instance's properties."""
return f"dictionary_enabled: {self.dictionary_enabled}, max_statistics_size: {self.max_statistics_size}, bloom_filter_properties: {self.bloom_filter_properties}"
return (
f"dictionary_enabled: {self.dictionary_enabled}, statistics_enabled: {self.statistics_enabled}, "
f"max_statistics_size: {self.max_statistics_size}, bloom_filter_properties: {self.bloom_filter_properties}"
)


@dataclass(init=True)
Expand Down
20 changes: 19 additions & 1 deletion python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ use deltalake::operations::vacuum::VacuumBuilder;
use deltalake::operations::{collect_sendable_stream, CustomExecuteHandler};
use deltalake::parquet::basic::Compression;
use deltalake::parquet::errors::ParquetError;
use deltalake::parquet::file::properties::WriterProperties;
use deltalake::parquet::file::properties::{EnabledStatistics, WriterProperties};
use deltalake::partitions::PartitionFilter;
use deltalake::protocol::{DeltaOperation, SaveMode};
use deltalake::storage::{IORuntime, ObjectStoreRef};
Expand Down Expand Up @@ -1566,6 +1566,13 @@ fn set_writer_properties(writer_properties: PyWriterProperties) -> DeltaResult<W
if let Some(dictionary_enabled) = default_column_properties.dictionary_enabled {
properties = properties.set_dictionary_enabled(dictionary_enabled);
}
if let Some(statistics_enabled) = default_column_properties.statistics_enabled {
let enabled_statistics: EnabledStatistics = statistics_enabled
.parse()
.map_err(|err: String| DeltaTableError::Generic(err))?;

properties = properties.set_statistics_enabled(enabled_statistics);
}
if let Some(max_statistics_size) = default_column_properties.max_statistics_size {
properties = properties.set_max_statistics_size(max_statistics_size);
}
Expand All @@ -1591,6 +1598,16 @@ fn set_writer_properties(writer_properties: PyWriterProperties) -> DeltaResult<W
dictionary_enabled,
);
}
if let Some(statistics_enabled) = column_prop.statistics_enabled {
let enabled_statistics: EnabledStatistics = statistics_enabled
.parse()
.map_err(|err: String| DeltaTableError::Generic(err))?;

properties = properties.set_column_statistics_enabled(
column_name.clone().into(),
enabled_statistics,
);
}
if let Some(bloom_filter_properties) = column_prop.bloom_filter_properties {
if let Some(set_bloom_filter_enabled) =
bloom_filter_properties.set_bloom_filter_enabled
Expand Down Expand Up @@ -1919,6 +1936,7 @@ pub struct BloomFilterProperties {
/// Per-column parquet writer settings received from Python.
///
/// Derives `FromPyObject`; the field names match the attributes set by the
/// Python `ColumnProperties` class. Every field is optional: in
/// `set_writer_properties` a field that is `None` is skipped, so the
/// corresponding writer setter is not called for it.
#[derive(FromPyObject)]
pub struct ColumnProperties {
/// Whether dictionary encoding is enabled for the column.
pub dictionary_enabled: Option<bool>,
/// Statistics level as a string. `set_writer_properties` parses it into
/// `EnabledStatistics` and reports a parse failure as
/// `DeltaTableError::Generic`. The Python side types this as one of
/// `"NONE"`, `"CHUNK"` or `"PAGE"`.
pub statistics_enabled: Option<String>,
/// Maximum size of statistics for the column.
pub max_statistics_size: Option<usize>,
/// Bloom filter settings for the column.
pub bloom_filter_properties: Option<BloomFilterProperties>,
}
Expand Down
2 changes: 2 additions & 0 deletions python/tests/test_writerproperties.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,15 @@ def test_writer_properties_all_filled():
column_properties={
"a": ColumnProperties(
dictionary_enabled=True,
statistics_enabled="CHUNK",
max_statistics_size=40,
bloom_filter_properties=BloomFilterProperties(
set_bloom_filter_enabled=True, fpp=0.2, ndv=30
),
),
"b": ColumnProperties(
dictionary_enabled=True,
statistics_enabled="PAGE",
max_statistics_size=400,
bloom_filter_properties=BloomFilterProperties(
set_bloom_filter_enabled=False, fpp=0.2, ndv=30
Expand Down

0 comments on commit 8667622

Please sign in to comment.