From 841619ec6f5195892419e8442d6194377e97776f Mon Sep 17 00:00:00 2001 From: Jonas Irgens Kylling Date: Wed, 18 Dec 2024 20:26:12 +0100 Subject: [PATCH] Use projected_table_schema for projection in DeltaSchemaAdapter After upgrading from deltalake 0.20.1 to 0.22.3 it looks like Parquet column projection is broken when using DeltaTable::scan. Instead of scanning only a single column, it looks like all columns are fetched from storage. Inspection with a debugger reveals that the adapted_projections are wrong here: https://github.com/apache/datafusion/blob/88f58bf929167c5c5e2250ad87caa88d4dff11e5/datafusion/core/src/datasource/physical_plan/parquet/opener.rs#L153-L159 The adapted_projections are obtained in https://github.com/delta-io/delta-rs/blob/5b2f46b06e0eb508f932a8b39feb11b568a78a32/crates/core/src/delta_datafusion/schema_adapter.rs#L46-L60 Changing line 49 to use the projected_table_schema seems to solve the problem. --- crates/core/src/delta_datafusion/schema_adapter.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crates/core/src/delta_datafusion/schema_adapter.rs b/crates/core/src/delta_datafusion/schema_adapter.rs index 5b85af9a60..a792c2d285 100644 --- a/crates/core/src/delta_datafusion/schema_adapter.rs +++ b/crates/core/src/delta_datafusion/schema_adapter.rs @@ -46,7 +46,12 @@ impl SchemaAdapter for DeltaSchemaAdapter { let mut projection = Vec::with_capacity(file_schema.fields().len()); for (file_idx, file_field) in file_schema.fields.iter().enumerate() { - if self.table_schema.fields().find(file_field.name()).is_some() { + if self + .projected_table_schema + .fields() + .find(file_field.name()) + .is_some() + { projection.push(file_idx); } }