
Commit 80b8d8c

Expose DataFusion statistics on an IcebergTableScan
1 parent 4603b64 commit 80b8d8c

10 files changed: +252 −27 lines

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.

crates/iceberg/src/scan.rs

Lines changed: 1 addition & 21 deletions
@@ -188,27 +188,7 @@ impl<'a> TableScanBuilder<'a> {
 
     /// Build the table scan.
     pub fn build(self) -> Result<TableScan> {
-        let snapshot = match self.snapshot_id {
-            Some(snapshot_id) => self
-                .table
-                .metadata()
-                .snapshot_by_id(snapshot_id)
-                .ok_or_else(|| {
-                    Error::new(
-                        ErrorKind::DataInvalid,
-                        format!("Snapshot with id {} not found", snapshot_id),
-                    )
-                })?
-                .clone(),
-            None => self
-                .table
-                .metadata()
-                .current_snapshot()
-                .ok_or_else(|| {
-                    Error::new(ErrorKind::Unexpected, "Can't scan table without snapshots")
-                })?
-                .clone(),
-        };
+        let snapshot = self.table.snapshot(self.snapshot_id)?;
 
         let schema = snapshot.schema(self.table.metadata())?;

crates/iceberg/src/table.rs

Lines changed: 24 additions & 1 deletion
@@ -24,7 +24,7 @@ use crate::io::object_cache::ObjectCache;
 use crate::io::FileIO;
 use crate::metadata_scan::MetadataTable;
 use crate::scan::TableScanBuilder;
-use crate::spec::{TableMetadata, TableMetadataRef};
+use crate::spec::{SnapshotRef, TableMetadata, TableMetadataRef};
 use crate::{Error, ErrorKind, Result, TableIdent};
 
 /// Builder to create table scan.
@@ -201,6 +201,29 @@ impl Table {
         TableScanBuilder::new(self)
     }
 
+    /// Get the specified or latest snapshot for this table
+    pub fn snapshot(&self, snapshot_id: Option<i64>) -> Result<SnapshotRef> {
+        Ok(match snapshot_id {
+            Some(snapshot_id) => self
+                .metadata()
+                .snapshot_by_id(snapshot_id)
+                .ok_or_else(|| {
+                    Error::new(
+                        ErrorKind::DataInvalid,
+                        format!("Snapshot with id {} not found", snapshot_id),
+                    )
+                })?
+                .clone(),
+            None => self
+                .metadata()
+                .current_snapshot()
+                .ok_or_else(|| {
+                    Error::new(ErrorKind::Unexpected, "Can't scan table without snapshots")
+                })?
+                .clone(),
+        })
+    }
+
     /// Creates a metadata table which provides table-like APIs for inspecting metadata.
     /// See [`MetadataTable`] for more details.
     pub fn metadata_table(self) -> MetadataTable {
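
Both the scan builder above and the new statistics code below resolve snapshots through this helper. A minimal sketch of how a caller might lean on it (the wrapper function is illustrative, not part of the commit; Table::snapshot and Snapshot::snapshot_id are the real APIs):

use iceberg::table::Table;
use iceberg::Result;

// Illustrative only: pass None for the current snapshot, or Some(id) to pin one.
fn resolve_snapshot_id(table: &Table, snapshot_id: Option<i64>) -> Result<i64> {
    Ok(table.snapshot(snapshot_id)?.snapshot_id())
}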

crates/integration_tests/Cargo.toml

Lines changed: 2 additions & 0 deletions
@@ -27,9 +27,11 @@ rust-version = { workspace = true }
 [dependencies]
 arrow-array = { workspace = true }
 arrow-schema = { workspace = true }
+datafusion = "44"
 futures = { workspace = true }
 iceberg = { workspace = true }
 iceberg-catalog-rest = { workspace = true }
+iceberg-datafusion = { workspace = true }
 iceberg_test_utils = { path = "../test_utils", features = ["tests"] }
 parquet = { workspace = true }
 tokio = { workspace = true }
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion::common::stats::Precision;
+use datafusion::common::{ColumnStatistics, ScalarValue, Statistics};
+use iceberg::{Catalog, Result, TableIdent};
+use iceberg_datafusion::compute_statistics;
+use iceberg_integration_tests::set_test_fixture;
+
+#[tokio::test]
+async fn test_statistics() -> Result<()> {
+    let fixture = set_test_fixture("datafusion_statistics").await;
+
+    let catalog = fixture.rest_catalog;
+
+    let table = catalog
+        .load_table(
+            &TableIdent::from_strs(["default", "test_positional_merge_on_read_double_deletes"])
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+
+    let stats = compute_statistics(&table, None).await?;
+
+    assert_eq!(stats, Statistics {
+        num_rows: Precision::Inexact(14),
+        total_byte_size: Precision::Absent,
+        column_statistics: vec![
+            ColumnStatistics {
+                null_count: Precision::Inexact(0),
+                max_value: Precision::Inexact(ScalarValue::Date32(Some(19428))),
+                min_value: Precision::Inexact(ScalarValue::Date32(Some(19417))),
+                distinct_count: Precision::Absent,
+            },
+            ColumnStatistics {
+                null_count: Precision::Inexact(0),
+                max_value: Precision::Inexact(ScalarValue::Int32(Some(12))),
+                min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
+                distinct_count: Precision::Absent,
+            },
+            ColumnStatistics {
+                null_count: Precision::Inexact(0),
+                max_value: Precision::Inexact(ScalarValue::Utf8View(Some("l".to_string()))),
+                min_value: Precision::Inexact(ScalarValue::Utf8View(Some("a".to_string()))),
+                distinct_count: Precision::Absent,
+            },
+        ],
+    });
+
+    Ok(())
+}

crates/integrations/datafusion/src/lib.rs

Lines changed: 3 additions & 0 deletions
@@ -23,6 +23,9 @@ pub use error::*;
 
 mod physical_plan;
 mod schema;
+mod statistics;
 mod table;
+
+pub use statistics::*;
 pub use table::table_provider_factory::IcebergTableProviderFactory;
 pub use table::*;
crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs

Lines changed: 28 additions & 4 deletions
@@ -20,7 +20,7 @@ use std::vec;
 use datafusion::logical_expr::{Expr, Operator};
 use datafusion::scalar::ScalarValue;
 use iceberg::expr::{BinaryExpression, Predicate, PredicateOperator, Reference, UnaryExpression};
-use iceberg::spec::Datum;
+use iceberg::spec::{Datum, PrimitiveLiteral, PrimitiveType};
 
 // A datafusion expression could be an Iceberg predicate, column, or literal.
 enum TransformedResult {
@@ -195,20 +195,44 @@ const MILLIS_PER_DAY: i64 = 24 * 60 * 60 * 1000;
 /// Convert a scalar value to an iceberg datum.
 fn scalar_value_to_datum(value: &ScalarValue) -> Option<Datum> {
     match value {
+        ScalarValue::Boolean(Some(v)) => Some(Datum::bool(*v)),
         ScalarValue::Int8(Some(v)) => Some(Datum::int(*v as i32)),
         ScalarValue::Int16(Some(v)) => Some(Datum::int(*v as i32)),
         ScalarValue::Int32(Some(v)) => Some(Datum::int(*v)),
         ScalarValue::Int64(Some(v)) => Some(Datum::long(*v)),
-        ScalarValue::Float32(Some(v)) => Some(Datum::double(*v as f64)),
+        ScalarValue::Float32(Some(v)) => Some(Datum::float(*v)),
         ScalarValue::Float64(Some(v)) => Some(Datum::double(*v)),
-        ScalarValue::Utf8(Some(v)) => Some(Datum::string(v.clone())),
-        ScalarValue::LargeUtf8(Some(v)) => Some(Datum::string(v.clone())),
+        ScalarValue::Utf8(Some(v))
+        | ScalarValue::Utf8View(Some(v))
+        | ScalarValue::LargeUtf8(Some(v)) => Some(Datum::string(v.clone())),
         ScalarValue::Date32(Some(v)) => Some(Datum::date(*v)),
         ScalarValue::Date64(Some(v)) => Some(Datum::date((*v / MILLIS_PER_DAY) as i32)),
         _ => None,
     }
 }
 
+/// Convert an iceberg datum to a datafusion scalar value.
+pub fn datum_to_scalar_value(datum: &Datum) -> Option<ScalarValue> {
+    match (datum.data_type(), datum.literal()) {
+        (PrimitiveType::Binary, PrimitiveLiteral::Boolean(v)) => {
+            Some(ScalarValue::Boolean(Some(*v)))
+        }
+        (PrimitiveType::Int, PrimitiveLiteral::Int(v)) => Some(ScalarValue::Int32(Some(*v))),
+        (PrimitiveType::Long, PrimitiveLiteral::Long(v)) => Some(ScalarValue::Int64(Some(*v))),
+        (PrimitiveType::Float, PrimitiveLiteral::Float(v)) => {
+            Some(ScalarValue::Float32(Some(v.into_inner())))
+        }
+        (PrimitiveType::Double, PrimitiveLiteral::Double(v)) => {
+            Some(ScalarValue::Float64(Some(v.into_inner())))
+        }
+        (PrimitiveType::String, PrimitiveLiteral::String(v)) => {
+            Some(ScalarValue::Utf8View(Some(v.clone())))
+        }
+        (PrimitiveType::Date, PrimitiveLiteral::Int(v)) => Some(ScalarValue::Date32(Some(*v))),
+        _ => None,
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use datafusion::arrow::datatypes::{DataType, Field, Schema};
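
datum_to_scalar_value is the inverse direction of scalar_value_to_datum, and it is what the statistics module below uses to turn data-file lower/upper bounds into DataFusion min/max values. A minimal in-crate sketch (a hypothetical unit test, not part of the commit; it relies on the module's existing Datum and ScalarValue imports):

#[test]
fn date_datum_converts_to_date32_scalar() {
    // 19417 days since the Unix epoch, the lower bound used in the integration test.
    let lower = Datum::date(19417);
    assert_eq!(
        datum_to_scalar_value(&lower),
        Some(ScalarValue::Date32(Some(19417)))
    );
}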

crates/integrations/datafusion/src/physical_plan/scan.rs

Lines changed: 9 additions & 0 deletions
@@ -22,6 +22,7 @@ use std::vec;
 
 use datafusion::arrow::array::RecordBatch;
 use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef;
+use datafusion::common::Statistics;
 use datafusion::error::Result as DFResult;
 use datafusion::execution::{SendableRecordBatchStream, TaskContext};
 use datafusion::physical_expr::EquivalenceProperties;
@@ -44,6 +45,8 @@ pub(crate) struct IcebergTableScan {
     table: Table,
     /// Snapshot of the table to scan.
     snapshot_id: Option<i64>,
+    /// Statistics for the table; row count, and null count/min-max values per column.
+    statistics: Statistics,
     /// Stores certain, often expensive to compute,
     /// plan properties used in query optimization.
     plan_properties: PlanProperties,
@@ -59,6 +62,7 @@ impl IcebergTableScan {
         table: Table,
         snapshot_id: Option<i64>,
         schema: ArrowSchemaRef,
+        statistics: Statistics,
         projection: Option<&Vec<usize>>,
         filters: &[Expr],
     ) -> Self {
@@ -73,6 +77,7 @@
         Self {
             table,
             snapshot_id,
+            statistics,
             plan_properties,
             projection,
             predicates,
@@ -135,6 +140,10 @@ impl ExecutionPlan for IcebergTableScan {
             stream,
         )))
     }
+
+    fn statistics(&self) -> DFResult<Statistics> {
+        Ok(self.statistics.clone())
+    }
 }
 
 impl DisplayAs for IcebergTableScan {
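
Overriding ExecutionPlan::statistics is what actually surfaces the estimates to DataFusion, which can use them for cost-based decisions such as join ordering. A hypothetical consumer sketch (not part of the commit) showing how a row-count estimate can be read back off any plan node:

use datafusion::error::Result as DFResult;
use datafusion::physical_plan::ExecutionPlan;

// Precision::get_value returns Some(&v) for Exact/Inexact estimates and None for Absent.
fn estimated_rows(plan: &dyn ExecutionPlan) -> DFResult<Option<usize>> {
    Ok(plan.statistics()?.num_rows.get_value().copied())
}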

crates/integrations/datafusion/src/statistics.rs

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+
+use datafusion::common::stats::Precision;
+use datafusion::common::{ColumnStatistics, Statistics};
+use iceberg::spec::ManifestStatus;
+use iceberg::table::Table;
+use iceberg::Result;
+
+use crate::physical_plan::expr_to_predicate::datum_to_scalar_value;
+
+// Compute DataFusion table statistics for a given table/snapshot
+pub async fn compute_statistics(table: &Table, snapshot_id: Option<i64>) -> Result<Statistics> {
+    let file_io = table.file_io();
+    let metadata = table.metadata();
+    let snapshot = table.snapshot(snapshot_id)?;
+
+    let mut num_rows = 0;
+    let mut lower_bounds = HashMap::new();
+    let mut upper_bounds = HashMap::new();
+    let mut null_counts = HashMap::new();
+
+    let manifest_list = snapshot.load_manifest_list(file_io, metadata).await?;
+
+    // For each existing/added manifest in the snapshot aggregate the row count, as well as null
+    // count and min/max values.
+    for manifest_file in manifest_list.entries() {
+        let manifest = manifest_file.load_manifest(file_io).await?;
+        manifest.entries().iter().for_each(|manifest_entry| {
+            if manifest_entry.status() != ManifestStatus::Deleted {
+                let data_file = manifest_entry.data_file();
+                num_rows += data_file.record_count();
+                data_file.lower_bounds().iter().for_each(|(col_id, min)| {
+                    lower_bounds
+                        .entry(*col_id)
+                        .and_modify(|col_min| {
+                            if min < col_min {
+                                *col_min = min.clone()
+                            }
+                        })
+                        .or_insert(min.clone());
+                });
+                data_file.upper_bounds().iter().for_each(|(col_id, max)| {
+                    upper_bounds
+                        .entry(*col_id)
+                        .and_modify(|col_max| {
+                            if max > col_max {
+                                *col_max = max.clone()
+                            }
+                        })
+                        .or_insert(max.clone());
+                });
+                data_file
+                    .null_value_counts()
+                    .iter()
+                    .for_each(|(col_id, null_count)| {
+                        null_counts
+                            .entry(*col_id)
+                            .and_modify(|col_null_count| *col_null_count += *null_count)
+                            .or_insert(*null_count);
+                    });
+            }
+        })
+    }
+
+    // Construct the DataFusion `Statistics` object, leaving any missing info as `Precision::Absent`
+    let schema = snapshot.schema(metadata)?;
+    let col_stats = schema
+        .as_struct()
+        .fields()
+        .iter()
+        .map(|field| {
+            ColumnStatistics {
+                null_count: null_counts
+                    .get(&field.id)
+                    .map(|nc| Precision::Inexact(*nc as usize))
+                    .unwrap_or(Precision::Absent),
+                max_value: upper_bounds
+                    .get(&field.id)
+                    .and_then(|datum| datum_to_scalar_value(datum).map(Precision::Inexact))
+                    .unwrap_or(Precision::Absent),
+                min_value: lower_bounds
+                    .get(&field.id)
+                    .and_then(|datum| datum_to_scalar_value(datum).map(Precision::Inexact))
+                    .unwrap_or(Precision::Absent),
+                distinct_count: Precision::Absent, // will be picked up after #417
+            }
+        })
+        .collect();
+
+    Ok(Statistics {
+        num_rows: Precision::Inexact(num_rows as usize),
+        total_byte_size: Precision::Absent,
+        column_statistics: col_stats,
+    })
+}
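
compute_statistics walks every manifest entry in the snapshot, so it performs I/O proportional to the number of manifests; callers would typically run it once when building a scan. A hypothetical wiring sketch (not part of this commit) that degrades to unknown statistics instead of failing the plan:

use datafusion::arrow::datatypes::Schema as ArrowSchema;
use datafusion::common::Statistics;
use iceberg::table::Table;
use iceberg_datafusion::compute_statistics;

// Fall back to Statistics::new_unknown if manifest reads fail, so planning still succeeds.
async fn statistics_or_unknown(table: &Table, snapshot_id: Option<i64>, schema: &ArrowSchema) -> Statistics {
    compute_statistics(table, snapshot_id)
        .await
        .unwrap_or_else(|_| Statistics::new_unknown(schema))
}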
