Skip to content

Commit

Permalink
use a single row_count column during predicate pruning instead of one…
Browse files Browse the repository at this point in the history
… per column
  • Loading branch information
adriangb committed Jan 25, 2025
1 parent 46101f3 commit c4ebe69
Showing 1 changed file with 45 additions and 42 deletions.
87 changes: 45 additions & 42 deletions datafusion/physical-optimizer/src/pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -836,15 +836,15 @@ impl RequiredColumns {
None => (self.columns.len(), true),
};

let suffix = match stat_type {
StatisticsType::Min => "min",
StatisticsType::Max => "max",
StatisticsType::NullCount => "null_count",
StatisticsType::RowCount => "row_count",
let column_name = column.name();
let stat_column_name = match stat_type {
StatisticsType::Min => format!("{column_name}_min"),
StatisticsType::Max => format!("{column_name}_max"),
StatisticsType::NullCount => format!("{column_name}_null_count"),
StatisticsType::RowCount => "row_count".to_string(),
};

let stat_column =
phys_expr::Column::new(&format!("{}_{}", column.name(), suffix), idx);
let stat_column = phys_expr::Column::new(&stat_column_name, idx);

// only add statistics column if not previously added
if need_to_insert {
Expand Down Expand Up @@ -2233,7 +2233,7 @@ mod tests {
#[test]
fn row_group_predicate_eq() -> Result<()> {
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
let expected_expr = "CASE WHEN c1_null_count@2 = c1_row_count@3 THEN false ELSE c1_min@0 <= 1 AND 1 <= c1_max@1 END";
let expected_expr = "CASE WHEN c1_null_count@2 = row_count@3 THEN false ELSE c1_min@0 <= 1 AND 1 <= c1_max@1 END";

// test column on the left
let expr = col("c1").eq(lit(1));
Expand All @@ -2253,7 +2253,7 @@ mod tests {
#[test]
fn row_group_predicate_not_eq() -> Result<()> {
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
let expected_expr = "CASE WHEN c1_null_count@2 = c1_row_count@3 THEN false ELSE c1_min@0 != 1 OR 1 != c1_max@1 END";
let expected_expr = "CASE WHEN c1_null_count@2 = row_count@3 THEN false ELSE c1_min@0 != 1 OR 1 != c1_max@1 END";

// test column on the left
let expr = col("c1").not_eq(lit(1));
Expand All @@ -2274,7 +2274,7 @@ mod tests {
fn row_group_predicate_gt() -> Result<()> {
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
let expected_expr =
"CASE WHEN c1_null_count@1 = c1_row_count@2 THEN false ELSE c1_max@0 > 1 END";
"CASE WHEN c1_null_count@1 = row_count@2 THEN false ELSE c1_max@0 > 1 END";

// test column on the left
let expr = col("c1").gt(lit(1));
Expand All @@ -2294,7 +2294,8 @@ mod tests {
#[test]
fn row_group_predicate_gt_eq() -> Result<()> {
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
let expected_expr = "CASE WHEN c1_null_count@1 = c1_row_count@2 THEN false ELSE c1_max@0 >= 1 END";
let expected_expr =
"CASE WHEN c1_null_count@1 = row_count@2 THEN false ELSE c1_max@0 >= 1 END";

// test column on the left
let expr = col("c1").gt_eq(lit(1));
Expand All @@ -2314,7 +2315,7 @@ mod tests {
fn row_group_predicate_lt() -> Result<()> {
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
let expected_expr =
"CASE WHEN c1_null_count@1 = c1_row_count@2 THEN false ELSE c1_min@0 < 1 END";
"CASE WHEN c1_null_count@1 = row_count@2 THEN false ELSE c1_min@0 < 1 END";

// test column on the left
let expr = col("c1").lt(lit(1));
Expand All @@ -2334,7 +2335,8 @@ mod tests {
#[test]
fn row_group_predicate_lt_eq() -> Result<()> {
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
let expected_expr = "CASE WHEN c1_null_count@1 = c1_row_count@2 THEN false ELSE c1_min@0 <= 1 END";
let expected_expr =
"CASE WHEN c1_null_count@1 = row_count@2 THEN false ELSE c1_min@0 <= 1 END";

// test column on the left
let expr = col("c1").lt_eq(lit(1));
Expand All @@ -2360,7 +2362,7 @@ mod tests {
// test AND operator joining supported c1 < 1 expression and unsupported c2 > c3 expression
let expr = col("c1").lt(lit(1)).and(col("c2").lt(col("c3")));
let expected_expr =
"CASE WHEN c1_null_count@1 = c1_row_count@2 THEN false ELSE c1_min@0 < 1 END";
"CASE WHEN c1_null_count@1 = row_count@2 THEN false ELSE c1_min@0 < 1 END";
let predicate_expr =
test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
assert_eq!(predicate_expr.to_string(), expected_expr);
Expand Down Expand Up @@ -2426,7 +2428,8 @@ mod tests {
#[test]
fn row_group_predicate_lt_bool() -> Result<()> {
let schema = Schema::new(vec![Field::new("c1", DataType::Boolean, false)]);
let expected_expr = "CASE WHEN c1_null_count@1 = c1_row_count@2 THEN false ELSE c1_min@0 < true END";
let expected_expr =
"CASE WHEN c1_null_count@1 = row_count@2 THEN false ELSE c1_min@0 < true END";

// DF doesn't support arithmetic on boolean columns so
// this predicate will error when evaluated
Expand All @@ -2451,16 +2454,16 @@ mod tests {
.and(col("c2").eq(lit(2)).or(col("c2").eq(lit(3))));
let expected_expr = "\
CASE \
WHEN c1_null_count@1 = c1_row_count@2 THEN false \
WHEN c1_null_count@1 = row_count@2 THEN false \
ELSE c1_min@0 < 1 \
END \
AND (\
CASE \
WHEN c2_null_count@5 = c2_row_count@6 THEN false \
WHEN c2_null_count@5 = row_count@6 THEN false \
ELSE c2_min@3 <= 2 AND 2 <= c2_max@4 \
END \
OR CASE \
WHEN c2_null_count@5 = c2_row_count@6 THEN false \
WHEN c2_null_count@5 = row_count@6 THEN false \
ELSE c2_min@3 <= 3 AND 3 <= c2_max@4 \
END\
)";
Expand All @@ -2487,14 +2490,14 @@ mod tests {
c1_null_count_field.with_nullable(true) // could be nullable if stats are not present
)
);
// c1 < 1 should add c1_row_count
let c1_row_count_field = Field::new("c1_row_count", DataType::UInt64, false);
// c1 < 1 should add row_count
let row_count_field = Field::new("row_count", DataType::UInt64, false);
assert_eq!(
required_columns.columns[2],
(
phys_expr::Column::new("c1", 0),
StatisticsType::RowCount,
c1_row_count_field.with_nullable(true) // could be nullable if stats are not present
row_count_field.with_nullable(true) // could be nullable if stats are not present
)
);
// c2 = 2 should add c2_min and c2_max
Expand Down Expand Up @@ -2526,14 +2529,14 @@ mod tests {
c2_null_count_field.with_nullable(true) // could be nullable if stats are not present
)
);
// c2 = 2 should add c2_row_count
let c2_row_count_field = Field::new("c2_row_count", DataType::UInt64, false);
// c2 = 2 should add row_count
let row_count_field = Field::new("row_count", DataType::UInt64, false);
assert_eq!(
required_columns.columns[6],
(
phys_expr::Column::new("c2", 1),
StatisticsType::RowCount,
c2_row_count_field.with_nullable(true) // could be nullable if stats are not present
row_count_field.with_nullable(true) // could be nullable if stats are not present
)
);
// c2 = 3 shouldn't add any new statistics fields
Expand All @@ -2555,15 +2558,15 @@ mod tests {
false,
));
let expected_expr = "CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE c1_min@0 <= 1 AND 1 <= c1_max@1 \
END \
OR CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE c1_min@0 <= 2 AND 2 <= c1_max@1 \
END \
OR CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE c1_min@0 <= 3 AND 3 <= c1_max@1 \
END";
let predicate_expr =
Expand Down Expand Up @@ -2603,15 +2606,15 @@ mod tests {
));
let expected_expr = "\
CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE c1_min@0 != 1 OR 1 != c1_max@1 \
END \
AND CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE c1_min@0 != 2 OR 2 != c1_max@1 \
END \
AND CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE c1_min@0 != 3 OR 3 != c1_max@1 \
END";
let predicate_expr =
Expand Down Expand Up @@ -2662,19 +2665,19 @@ mod tests {
let expected_expr = "\
(\
CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE c1_min@0 <= 1 AND 1 <= c1_max@1 \
END \
OR CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE c1_min@0 <= 2 AND 2 <= c1_max@1 \
END\
) AND CASE \
WHEN c2_null_count@5 = c2_row_count@6 THEN false \
WHEN c2_null_count@5 = row_count@6 THEN false \
ELSE c2_max@4 >= 4 \
END \
AND CASE \
WHEN c2_null_count@5 = c2_row_count@6 THEN false \
WHEN c2_null_count@5 = row_count@6 THEN false \
ELSE c2_min@7 <= 5 \
END";
let predicate_expr =
Expand Down Expand Up @@ -2704,7 +2707,7 @@ mod tests {
fn row_group_predicate_cast() -> Result<()> {
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
let expected_expr = "CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE CAST(c1_min@0 AS Int64) <= 1 AND 1 <= CAST(c1_max@1 AS Int64) \
END";

Expand All @@ -2722,7 +2725,7 @@ mod tests {
assert_eq!(predicate_expr.to_string(), expected_expr);

let expected_expr = "CASE \
WHEN c1_null_count@1 = c1_row_count@2 THEN false \
WHEN c1_null_count@1 = row_count@2 THEN false \
ELSE TRY_CAST(c1_max@0 AS Int64) > 1 \
END";

Expand Down Expand Up @@ -2757,15 +2760,15 @@ mod tests {
false,
));
let expected_expr = "CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE CAST(c1_min@0 AS Int64) <= 1 AND 1 <= CAST(c1_max@1 AS Int64) \
END \
OR CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE CAST(c1_min@0 AS Int64) <= 2 AND 2 <= CAST(c1_max@1 AS Int64) \
END \
OR CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE CAST(c1_min@0 AS Int64) <= 3 AND 3 <= CAST(c1_max@1 AS Int64) \
END";
let predicate_expr =
Expand All @@ -2782,15 +2785,15 @@ mod tests {
true,
));
let expected_expr = "CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE CAST(c1_min@0 AS Int64) != 1 OR 1 != CAST(c1_max@1 AS Int64) \
END \
AND CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE CAST(c1_min@0 AS Int64) != 2 OR 2 != CAST(c1_max@1 AS Int64) \
END \
AND CASE \
WHEN c1_null_count@2 = c1_row_count@3 THEN false \
WHEN c1_null_count@2 = row_count@3 THEN false \
ELSE CAST(c1_min@0 AS Int64) != 3 OR 3 != CAST(c1_max@1 AS Int64) \
END";
let predicate_expr =
Expand Down

0 comments on commit c4ebe69

Please sign in to comment.