From 6d8774a1bd87e6983f367d37111e6e2d78b725a5 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Sun, 2 Nov 2025 16:59:20 +0100 Subject: [PATCH 1/6] Add first draft of extension type registry --- Cargo.lock | 2 + Cargo.toml | 1 + datafusion/common/Cargo.toml | 1 + datafusion/common/src/types/canonical.rs | 182 ++++++++++++++++++++++ datafusion/common/src/types/extensions.rs | 57 +++++++ datafusion/common/src/types/logical.rs | 7 +- datafusion/common/src/types/mod.rs | 4 + datafusion/common/src/types/native.rs | 10 +- datafusion/expr/Cargo.toml | 1 + datafusion/expr/src/registry.rs | 138 ++++++++++++++++ datafusion/functions/Cargo.toml | 2 +- 11 files changed, 401 insertions(+), 4 deletions(-) create mode 100644 datafusion/common/src/types/canonical.rs create mode 100644 datafusion/common/src/types/extensions.rs diff --git a/Cargo.lock b/Cargo.lock index 1c516277c38a..cfcfc15695b3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2016,6 +2016,7 @@ dependencies = [ "apache-avro", "arrow", "arrow-ipc", + "arrow-schema", "chrono", "half", "hashbrown 0.14.5", @@ -2255,6 +2256,7 @@ name = "datafusion-expr" version = "50.3.0" dependencies = [ "arrow", + "arrow-schema", "async-trait", "chrono", "ctor", diff --git a/Cargo.toml b/Cargo.toml index 406ed29d3511..e56cc8a3c8f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -182,6 +182,7 @@ testcontainers = { version = "0.25.2", features = ["default"] } testcontainers-modules = { version = "0.13" } tokio = { version = "1.48", features = ["macros", "rt", "sync"] } url = "2.5.7" +uuid = { version = "1.18", features = ["v4"] } [workspace.lints.clippy] # Detects large stack-allocated futures that may cause stack overflow crashes (see threshold in clippy.toml) diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index abeb4e66a269..53a916319677 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -61,6 +61,7 @@ apache-avro = { version = "0.20", default-features = false, features = [ ], optional = true } arrow = { workspace = true } arrow-ipc = { workspace = true } +arrow-schema = { workspace = true } chrono = { workspace = true } half = { workspace = true } hashbrown = { workspace = true } diff --git a/datafusion/common/src/types/canonical.rs b/datafusion/common/src/types/canonical.rs new file mode 100644 index 000000000000..d769360f845e --- /dev/null +++ b/datafusion/common/src/types/canonical.rs @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::types::{ + LogicalType, NativeType, TypeParameter, TypeSignature, ValuePrettyPrinter, +}; +use crate::Result; +use crate::ScalarValue; +use std::sync::{Arc, LazyLock}; + +/// Represents the canonical [UUID extension type](https://arrow.apache.org/docs/format/CanonicalExtensions.html#uuid). +pub struct UuidType {} + +impl UuidType { + /// Creates a new [UuidType]. + pub fn new() -> Self { + Self {} + } +} + +impl Default for UuidType { + fn default() -> Self { + Self::new() + } +} + +impl LogicalType for UuidType { + fn native(&self) -> &NativeType { + &NativeType::FixedSizeBinary(16) + } + + fn signature(&self) -> TypeSignature<'_> { + TypeSignature::Extension { + name: "arrow.uuid", + parameters: vec![], + } + } + + fn pretty_printer(&self) -> &Arc { + static PRETTY_PRINTER: LazyLock> = + LazyLock::new(|| Arc::new(UuidValuePrettyPrinter {})); + &PRETTY_PRINTER + } +} + +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)] +struct UuidValuePrettyPrinter; + +impl ValuePrettyPrinter for UuidValuePrettyPrinter { + fn pretty_print_scalar(&self, value: &ScalarValue) -> Result { + Ok(format!("arrow.uuid({})", value)) + } +} + +/// Represents the canonical [Opaque extension type](https://arrow.apache.org/docs/format/CanonicalExtensions.html#opaque). +/// +/// In the context of DataFusion, a common use case of the opaque type is when an extension type +/// is unknown to DataFusion. Contrary to [UnresolvedExtensionType], the extension type has +/// already been checked against the extension type registry and was not found. +pub struct OpaqueType { + /// The underlying native type. + native_type: NativeType, +} + +impl OpaqueType { + /// Creates a new [OpaqueType]. + pub fn new(native_type: NativeType) -> Self { + Self { native_type } + } +} + +impl LogicalType for OpaqueType { + fn native(&self) -> &NativeType { + &NativeType::FixedSizeBinary(16) + } + + fn signature(&self) -> TypeSignature<'_> { + let parameter = TypeParameter::Type(TypeSignature::Native(&self.native_type)); + TypeSignature::Extension { + name: "arrow.opaque", + parameters: vec![parameter], + } + } + + fn pretty_printer(&self) -> &Arc { + static PRETTY_PRINTER: LazyLock> = + LazyLock::new(|| Arc::new(OpaqueValuePrettyPrinter {})); + &PRETTY_PRINTER + } +} + +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)] +struct OpaqueValuePrettyPrinter; + +impl ValuePrettyPrinter for OpaqueValuePrettyPrinter { + fn pretty_print_scalar(&self, value: &ScalarValue) -> Result { + Ok(format!("arrow.opaque({})", value)) + } +} + +/// Represents an unresolved extension type with a given native type and name. +/// +/// This does not necessarily indicate that DataFusion does not understand the extension type. For +/// this purpose, see [OpaqueType]. However, it does indicate that the extension type was not yet +/// checked against the extension type registry. +/// +/// This extension type exists because it is often challenging to gain access to an extension type +/// registry. Especially because extension type support is relatively new, and therefore this +/// consideration was not taken into account by users. This provides a workaround such that +/// unresolved extension types can be resolved at a later point in time where access to the registry +/// is available. +pub struct UnresolvedExtensionType { + /// The name of the underlying extension type. + name: String, + /// The metadata of the underlying extension type. + metadata: Option, + /// The underlying native type. + native_type: NativeType, +} + +impl UnresolvedExtensionType { + /// Creates a new [UnresolvedExtensionType]. + pub fn new(name: String, metadata: Option, native_type: NativeType) -> Self { + Self { + name, + metadata, + native_type, + } + } + + /// The name of the unresolved extension type. + pub fn name(&self) -> &str { + &self.name + } + + /// The metadata of the unresolved extension type. + pub fn metadata(&self) -> Option<&str> { + self.metadata.as_deref() + } +} + +impl LogicalType for UnresolvedExtensionType { + fn native(&self) -> &NativeType { + &self.native_type + } + + fn signature(&self) -> TypeSignature<'_> { + TypeSignature::Extension { + name: &"datafusion.unresolved", + parameters: vec![], + } + } + + fn pretty_printer(&self) -> &Arc { + static PRETTY_PRINTER: LazyLock> = + LazyLock::new(|| Arc::new(UnresolvedValuePrettyPrinter {})); + &PRETTY_PRINTER + } +} + +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)] +struct UnresolvedValuePrettyPrinter {} + +impl ValuePrettyPrinter for UnresolvedValuePrettyPrinter { + fn pretty_print_scalar(&self, value: &ScalarValue) -> Result { + Ok(format!("datafusion.unresolved({})", value)) + } +} diff --git a/datafusion/common/src/types/extensions.rs b/datafusion/common/src/types/extensions.rs new file mode 100644 index 000000000000..a029dac6bd37 --- /dev/null +++ b/datafusion/common/src/types/extensions.rs @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::Result; +use crate::ScalarValue; +use arrow::array::Array; +use std::fmt::Debug; + +/// Implements pretty printing for a set of types. +/// +/// For example, the default pretty-printer for a byte array might not be adequate for a UUID type, +/// which is physically stored as a fixed-length byte array. This extension allows the user to +/// override the default pretty-printer for a given type. +pub trait ValuePrettyPrinter: Debug + Sync + Send { + /// Pretty print a scalar value. + /// + /// # Error + /// + /// Will return an error if the given `df_type` is not supported by this pretty printer. + fn pretty_print_scalar(&self, value: &ScalarValue) -> Result; + + /// Pretty print a specific value of a given array. + /// + /// # Error + /// + /// Will return an error if the given `df_type` is not supported by this pretty printer. + fn pretty_print_array(&self, array: &dyn Array, index: usize) -> Result { + let value = ScalarValue::try_from_array(array, index)?; + self.pretty_print_scalar(&value) + } +} + +/// The default pretty printer. +/// +/// Uses the arrow implementation of printing values. +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)] +pub struct DefaultValuePrettyPrinter; + +impl ValuePrettyPrinter for DefaultValuePrettyPrinter { + fn pretty_print_scalar(&self, value: &ScalarValue) -> Result { + Ok(value.to_string()) + } +} diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index 674b1a41204d..8b05d6207474 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use super::NativeType; +use super::{NativeType, ValuePrettyPrinter}; use crate::error::Result; use arrow::datatypes::DataType; use core::fmt; @@ -32,7 +32,7 @@ pub enum TypeSignature<'a> { /// The `name` should contain the same value as 'ARROW:extension:name'. Extension { name: &'a str, - parameters: &'a [TypeParameter<'a>], + parameters: Vec>, }, } @@ -87,6 +87,9 @@ pub trait LogicalType: Sync + Send { fn default_cast_for(&self, origin: &DataType) -> Result { self.native().default_cast_for(origin) } + + /// Returns a pretty-printer that can format values of this type. + fn pretty_printer(&self) -> &Arc; } impl fmt::Debug for dyn LogicalType { diff --git a/datafusion/common/src/types/mod.rs b/datafusion/common/src/types/mod.rs index 2f9ce4ce0282..88eb3c8ce1f8 100644 --- a/datafusion/common/src/types/mod.rs +++ b/datafusion/common/src/types/mod.rs @@ -16,11 +16,15 @@ // under the License. mod builtin; +mod canonical; +mod extensions; mod field; mod logical; mod native; pub use builtin::*; +pub use canonical::*; +pub use extensions::*; pub use field::*; pub use logical::*; pub use native::*; diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 8c41701ae576..b9233805e1f0 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -17,14 +17,16 @@ use super::{ LogicalField, LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields, - TypeSignature, + TypeSignature, ValuePrettyPrinter, }; use crate::error::{Result, _internal_err}; +use crate::types::DefaultValuePrettyPrinter; use arrow::compute::can_cast_types; use arrow::datatypes::{ DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields, DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, }; +use std::sync::LazyLock; use std::{fmt::Display, sync::Arc}; /// Representation of a type that DataFusion can handle natively. It is a subset @@ -368,6 +370,12 @@ impl LogicalType for NativeType { } }) } + + fn pretty_printer(&self) -> &Arc { + static PRETTY_PRINTER: LazyLock> = + LazyLock::new(|| Arc::new(DefaultValuePrettyPrinter {})); + &PRETTY_PRINTER + } } // The following From, From, ... implementations are temporary diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index e6b2734cfff3..aeab7a75ba53 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -44,6 +44,7 @@ sql = ["sqlparser"] [dependencies] arrow = { workspace = true } +arrow-schema = { workspace = true } async-trait = { workspace = true } chrono = { workspace = true } datafusion-common = { workspace = true, default-features = false } diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index 9554dd68e175..d96b823b1f05 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -20,6 +20,8 @@ use crate::expr_rewriter::FunctionRewrite; use crate::planner::ExprPlanner; use crate::{AggregateUDF, ScalarUDF, UserDefinedLogicalNode, WindowUDF}; +use arrow::datatypes::DataType; +use datafusion_common::types::{LogicalTypeRef, NativeType}; use datafusion_common::{not_impl_err, plan_datafusion_err, HashMap, Result}; use std::collections::HashSet; use std::fmt::Debug; @@ -215,3 +217,139 @@ impl FunctionRegistry for MemoryFunctionRegistry { self.udwfs.keys().cloned().collect() } } + +/// TODO +pub trait ExtensionTypeRegistration: Debug { + /// TODO + fn type_name(&self) -> &str; + + /// TODO + fn create_logical_type( + &self, + data_type: DataType, + metadata: HashMap, + ) -> Result; +} + +/// TODO +type ExtensionTypeRegistrationRef = Arc; + +/// Supports registering custom [LogicalType]s, including native types. +pub trait ExtensionTypeRegistry { + /// Returns a reference to the logical type named `name`. + /// + /// Returns an error if there is no + fn extension_type(&self, name: &str) -> Result; + + /// TODO + fn create_logical_type_for( + &self, + data_type: DataType, + metadata: HashMap, + ) -> Result { + match metadata.get(arrow_schema::extension::EXTENSION_TYPE_NAME_KEY) { + None => Ok(Arc::new(NativeType::from(data_type))), + Some(name) => { + let extension_type = self.extension_type(name)?; + extension_type.create_logical_type(data_type, metadata) + } + } + } + + /// Registers a new [ExtensionTypeRegistrationRef], returning any previously registered + /// implementation. + /// + /// Returns an error if the type cannot be registered, for example, if the registry is + /// read-only. + fn register_extension_type( + &mut self, + extension_type: ExtensionTypeRegistrationRef, + ) -> Result>; + + /// Deregisters an extension type registration with the name `name`, returning the + /// implementation that was deregistered. + /// + /// Returns an error if the type cannot be deregistered, for example, if the registry is + /// read-only. + fn deregister_extension_type( + &mut self, + name: &str, + ) -> Result>; +} + +/// An [`ExtensionTypeRegistry`] that uses in memory [`HashMap`]s. +#[derive(Clone, Debug)] +pub struct MemoryExtensionTypeRegistry { + /// Holds a mapping between the name of an extension type and its logical type. + extension_types: HashMap, +} + +impl Default for MemoryExtensionTypeRegistry { + fn default() -> Self { + MemoryExtensionTypeRegistry { + extension_types: HashMap::new(), + } + } +} + +impl MemoryExtensionTypeRegistry { + /// Creates an empty [MemoryExtensionTypeRegistry]. + pub fn new() -> Self { + Self { + extension_types: HashMap::new(), + } + } + + /// Creates a new [MemoryExtensionTypeRegistry] with the provided `types`. + /// + /// # Errors + /// + /// Returns an error if one of the `types` is a native type. + pub fn new_with_types( + types: impl IntoIterator, + ) -> Result { + let extension_types = types + .into_iter() + .map(|t| (t.type_name().to_owned(), t)) + .collect(); + Ok(Self { extension_types }) + } + + /// Returns a list of all registered types. + pub fn all_extension_types(&self) -> Vec { + self.extension_types.values().cloned().collect() + } +} + +impl ExtensionTypeRegistry for MemoryExtensionTypeRegistry { + fn extension_type(&self, name: &str) -> Result { + self.extension_types + .get(name) + .ok_or_else(|| plan_datafusion_err!("Logical type not found.")) + .cloned() + } + + fn register_extension_type( + &mut self, + extension_type: ExtensionTypeRegistrationRef, + ) -> Result> { + Ok(self + .extension_types + .insert(extension_type.type_name().to_owned(), extension_type)) + } + + fn deregister_extension_type( + &mut self, + name: &str, + ) -> Result> { + Ok(self.extension_types.remove(name)) + } +} + +impl From> for MemoryExtensionTypeRegistry { + fn from(value: HashMap) -> Self { + Self { + extension_types: value, + } + } +} diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 1dbeee7159fd..96c28cc3fc69 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -83,7 +83,7 @@ rand = { workspace = true } regex = { workspace = true, optional = true } sha2 = { version = "^0.10.9", optional = true } unicode-segmentation = { version = "^1.7.1", optional = true } -uuid = { version = "1.18", features = ["v4"], optional = true } +uuid = { workspace = true, optional = true } [dev-dependencies] arrow = { workspace = true, features = ["test_utils"] } From 3d23b9f9b9cf3ac41a6e868438c5b4d273670099 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Sat, 8 Nov 2025 12:18:04 +0100 Subject: [PATCH 2/6] Use canonical extension types from arrow --- Cargo.lock | 1 + Cargo.toml | 3 +- datafusion/common/Cargo.toml | 1 + datafusion/common/src/types/canonical.rs | 90 ++++++++++++++---------- 4 files changed, 57 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cfcfc15695b3..e6c417d1a449 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2033,6 +2033,7 @@ dependencies = [ "recursive", "sqlparser", "tokio", + "uuid", "web-time", ] diff --git a/Cargo.toml b/Cargo.toml index e56cc8a3c8f7..bef898a29351 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,6 +94,7 @@ apache-avro = { version = "0.20", default-features = false } arrow = { version = "57.0.0", features = [ "prettyprint", "chrono-tz", + "canonical_extension_types" ] } arrow-buffer = { version = "57.0.0", default-features = false } arrow-flight = { version = "57.0.0", features = [ @@ -103,7 +104,7 @@ arrow-ipc = { version = "57.0.0", default-features = false, features = [ "lz4", ] } arrow-ord = { version = "57.0.0", default-features = false } -arrow-schema = { version = "57.0.0", default-features = false } +arrow-schema = { version = "57.0.0", default-features = false, features = ["canonical_extension_types"] } async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.10" diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 53a916319677..65d5ef7e8258 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -76,6 +76,7 @@ pyo3 = { version = "0.26", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true, optional = true } tokio = { workspace = true } +uuid = { version = "1.18.1", features = ["v4"] } [target.'cfg(target_family = "wasm")'.dependencies] web-time = "1.1.0" diff --git a/datafusion/common/src/types/canonical.rs b/datafusion/common/src/types/canonical.rs index d769360f845e..e2f9c3fce55d 100644 --- a/datafusion/common/src/types/canonical.rs +++ b/datafusion/common/src/types/canonical.rs @@ -15,37 +15,24 @@ // specific language governing permissions and limitations // under the License. +use crate::error::_internal_err; use crate::types::{ LogicalType, NativeType, TypeParameter, TypeSignature, ValuePrettyPrinter, }; -use crate::Result; use crate::ScalarValue; +use crate::{Result, _internal_datafusion_err}; +use arrow_schema::extension::{ExtensionType, Opaque, Uuid}; use std::sync::{Arc, LazyLock}; +use uuid::Bytes; -/// Represents the canonical [UUID extension type](https://arrow.apache.org/docs/format/CanonicalExtensions.html#uuid). -pub struct UuidType {} - -impl UuidType { - /// Creates a new [UuidType]. - pub fn new() -> Self { - Self {} - } -} - -impl Default for UuidType { - fn default() -> Self { - Self::new() - } -} - -impl LogicalType for UuidType { +impl LogicalType for Uuid { fn native(&self) -> &NativeType { &NativeType::FixedSizeBinary(16) } fn signature(&self) -> TypeSignature<'_> { TypeSignature::Extension { - name: "arrow.uuid", + name: Uuid::NAME, parameters: vec![], } } @@ -62,7 +49,21 @@ struct UuidValuePrettyPrinter; impl ValuePrettyPrinter for UuidValuePrettyPrinter { fn pretty_print_scalar(&self, value: &ScalarValue) -> Result { - Ok(format!("arrow.uuid({})", value)) + match value { + ScalarValue::FixedSizeBinary(16, value) => match value { + Some(value) => { + let bytes = Bytes::try_from(value.as_slice()).map_err(|_| { + _internal_datafusion_err!( + "Invalid UUID bytes even though type is correct." + ) + })?; + let uuid = uuid::Uuid::from_bytes(bytes); + Ok(format!("arrow.uuid({})", uuid)) + } + None => Ok("arrow.uuid(NULL)".to_owned()), + }, + _ => _internal_err!("Wrong scalar given to "), + } } } @@ -70,28 +71,19 @@ impl ValuePrettyPrinter for UuidValuePrettyPrinter { /// /// In the context of DataFusion, a common use case of the opaque type is when an extension type /// is unknown to DataFusion. Contrary to [UnresolvedExtensionType], the extension type has -/// already been checked against the extension type registry and was not found. -pub struct OpaqueType { - /// The underlying native type. - native_type: NativeType, -} - -impl OpaqueType { - /// Creates a new [OpaqueType]. - pub fn new(native_type: NativeType) -> Self { - Self { native_type } - } -} - -impl LogicalType for OpaqueType { +/// already been checked against the extension type registry and was not found. +impl LogicalType for Opaque { fn native(&self) -> &NativeType { &NativeType::FixedSizeBinary(16) } fn signature(&self) -> TypeSignature<'_> { - let parameter = TypeParameter::Type(TypeSignature::Native(&self.native_type)); + let parameter = TypeParameter::Type(TypeSignature::Extension { + name: self.metadata().type_name(), + parameters: vec![], + }); TypeSignature::Extension { - name: "arrow.opaque", + name: Opaque::NAME, parameters: vec![parameter], } } @@ -103,6 +95,8 @@ impl LogicalType for OpaqueType { } } +// TODO Other canonical extension types. + #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)] struct OpaqueValuePrettyPrinter; @@ -159,9 +153,13 @@ impl LogicalType for UnresolvedExtensionType { } fn signature(&self) -> TypeSignature<'_> { + let inner_type = TypeParameter::Type(TypeSignature::Extension { + name: &self.name, + parameters: vec![], + }); TypeSignature::Extension { name: &"datafusion.unresolved", - parameters: vec![], + parameters: vec![inner_type], } } @@ -180,3 +178,21 @@ impl ValuePrettyPrinter for UnresolvedValuePrettyPrinter { Ok(format!("datafusion.unresolved({})", value)) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + pub fn test_pretty_print_uuid() { + let my_uuid = uuid::Uuid::nil(); + let uuid = ScalarValue::FixedSizeBinary(16, Some(my_uuid.as_bytes().to_vec())); + + let printer = UuidValuePrettyPrinter::default(); + let pretty_printed = printer.pretty_print_scalar(&uuid).unwrap(); + assert_eq!( + pretty_printed, + "arrow.uuid(00000000-0000-0000-0000-000000000000)" + ); + } +} From 31ca29f79d10b701956e446e3fed9c7281dcedab Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Sat, 8 Nov 2025 12:18:20 +0100 Subject: [PATCH 3/6] Comments and use field in extension type registry --- datafusion/expr/src/registry.rs | 39 ++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index d96b823b1f05..9a5739e56249 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -20,7 +20,7 @@ use crate::expr_rewriter::FunctionRewrite; use crate::planner::ExprPlanner; use crate::{AggregateUDF, ScalarUDF, UserDefinedLogicalNode, WindowUDF}; -use arrow::datatypes::DataType; +use arrow_schema::Field; use datafusion_common::types::{LogicalTypeRef, NativeType}; use datafusion_common::{not_impl_err, plan_datafusion_err, HashMap, Result}; use std::collections::HashSet; @@ -218,20 +218,27 @@ impl FunctionRegistry for MemoryFunctionRegistry { } } -/// TODO +/// The registration of an extension type. +/// +/// Implementations of this trait are responsible for *creating* instances of [LogicalType] that +/// represent the semantics of an extension type. One cannot directly register the [LogicalType] +/// instances because some extension types may have parameters that are unknown at compile time +/// (e.g., the unknown type in [Opaque](arrow_schema::extension::Opaque)). pub trait ExtensionTypeRegistration: Debug { - /// TODO + /// The name of the extension type. + /// + /// This name will be used to find the correct [ExtensionTypeRegistration] when an extension + /// type is encountered. fn type_name(&self) -> &str; - /// TODO - fn create_logical_type( - &self, - data_type: DataType, - metadata: HashMap, - ) -> Result; + /// Creates a logical type instance from the provided `field`. + /// + /// The resulting [LogicalTypeRef] should only capture the *type information*, not any other + /// metadata or nullability information that is part of the field. + fn create_logical_type(&self, field: Field) -> Result; } -/// TODO +/// A cheaply clonable pointer to an [ExtensionTypeRegistration]. type ExtensionTypeRegistrationRef = Arc; /// Supports registering custom [LogicalType]s, including native types. @@ -242,16 +249,12 @@ pub trait ExtensionTypeRegistry { fn extension_type(&self, name: &str) -> Result; /// TODO - fn create_logical_type_for( - &self, - data_type: DataType, - metadata: HashMap, - ) -> Result { - match metadata.get(arrow_schema::extension::EXTENSION_TYPE_NAME_KEY) { - None => Ok(Arc::new(NativeType::from(data_type))), + fn create_logical_type_for_field(&self, field: Field) -> Result { + match field.extension_type_name() { + None => Ok(Arc::new(NativeType::from(field.data_type()))), Some(name) => { let extension_type = self.extension_type(name)?; - extension_type.create_logical_type(data_type, metadata) + extension_type.create_logical_type(field) } } } From 2d58c762a9b7e0c601b11bda262aec99be9fe52f Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Sat, 8 Nov 2025 12:23:03 +0100 Subject: [PATCH 4/6] Clippy --- datafusion/common/src/types/canonical.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/common/src/types/canonical.rs b/datafusion/common/src/types/canonical.rs index e2f9c3fce55d..f76569636e0f 100644 --- a/datafusion/common/src/types/canonical.rs +++ b/datafusion/common/src/types/canonical.rs @@ -58,7 +58,7 @@ impl ValuePrettyPrinter for UuidValuePrettyPrinter { ) })?; let uuid = uuid::Uuid::from_bytes(bytes); - Ok(format!("arrow.uuid({})", uuid)) + Ok(format!("arrow.uuid({uuid})")) } None => Ok("arrow.uuid(NULL)".to_owned()), }, @@ -102,7 +102,7 @@ struct OpaqueValuePrettyPrinter; impl ValuePrettyPrinter for OpaqueValuePrettyPrinter { fn pretty_print_scalar(&self, value: &ScalarValue) -> Result { - Ok(format!("arrow.opaque({})", value)) + Ok(format!("arrow.opaque({value})")) } } @@ -158,7 +158,7 @@ impl LogicalType for UnresolvedExtensionType { parameters: vec![], }); TypeSignature::Extension { - name: &"datafusion.unresolved", + name: "datafusion.unresolved", parameters: vec![inner_type], } } @@ -175,7 +175,7 @@ struct UnresolvedValuePrettyPrinter {} impl ValuePrettyPrinter for UnresolvedValuePrettyPrinter { fn pretty_print_scalar(&self, value: &ScalarValue) -> Result { - Ok(format!("datafusion.unresolved({})", value)) + Ok(format!("datafusion.unresolved({value})")) } } From 7511f4a029de2bef42c920acd146b95c5180ba39 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Sat, 8 Nov 2025 13:16:37 +0100 Subject: [PATCH 5/6] Plumbing for SessionState --- .../core/src/execution/session_state.rs | 52 +++++++++++++++- datafusion/expr/src/registry.rs | 61 +++++++++++++------ 2 files changed, 95 insertions(+), 18 deletions(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 2949b17537d9..4593c50d4902 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -52,7 +52,10 @@ use datafusion_expr::expr_rewriter::FunctionRewrite; use datafusion_expr::planner::ExprPlanner; #[cfg(feature = "sql")] use datafusion_expr::planner::TypePlanner; -use datafusion_expr::registry::{FunctionRegistry, SerializerRegistry}; +use datafusion_expr::registry::{ + ExtensionTypeRegistration, ExtensionTypeRegistrationRef, ExtensionTypeRegistry, + FunctionRegistry, MemoryExtensionTypeRegistry, SerializerRegistry, +}; use datafusion_expr::simplify::SimplifyInfo; #[cfg(feature = "sql")] use datafusion_expr::TableSource; @@ -156,6 +159,8 @@ pub struct SessionState { aggregate_functions: HashMap>, /// Window functions registered in the context window_functions: HashMap>, + /// Extension types registry for extensions. + extension_types: Arc, /// Deserializer registry for extensions. serializer_registry: Arc, /// Holds registered external FileFormat implementations @@ -921,6 +926,7 @@ pub struct SessionStateBuilder { scalar_functions: Option>>, aggregate_functions: Option>>, window_functions: Option>>, + extension_types: Option>, serializer_registry: Option>, file_formats: Option>>, config: Option, @@ -958,6 +964,7 @@ impl SessionStateBuilder { scalar_functions: None, aggregate_functions: None, window_functions: None, + extension_types: None, serializer_registry: None, file_formats: None, table_options: None, @@ -1010,6 +1017,7 @@ impl SessionStateBuilder { existing.aggregate_functions.into_values().collect_vec(), ), window_functions: Some(existing.window_functions.into_values().collect_vec()), + extension_types: Some(existing.extension_types), serializer_registry: Some(existing.serializer_registry), file_formats: Some(existing.file_formats.into_values().collect_vec()), config: Some(new_config), @@ -1235,6 +1243,15 @@ impl SessionStateBuilder { self } + /// Set the map of [`ExtensionTypeRegistration`]s + pub fn with_extension_type( + mut self, + registry: Arc, + ) -> Self { + self.extension_types = Some(registry); + self + } + /// Set the [`SerializerRegistry`] pub fn with_serializer_registry( mut self, @@ -1362,6 +1379,7 @@ impl SessionStateBuilder { scalar_functions, aggregate_functions, window_functions, + extension_types, serializer_registry, file_formats, table_options, @@ -1395,6 +1413,7 @@ impl SessionStateBuilder { scalar_functions: HashMap::new(), aggregate_functions: HashMap::new(), window_functions: HashMap::new(), + extension_types: Arc::new(MemoryExtensionTypeRegistry::default()), serializer_registry: serializer_registry .unwrap_or_else(|| Arc::new(EmptySerializerRegistry)), file_formats: HashMap::new(), @@ -1463,6 +1482,10 @@ impl SessionStateBuilder { }); } + if let Some(extension_types) = extension_types { + state.extension_types = extension_types; + } + if state.config.create_default_catalog_and_schema() { let default_catalog = SessionStateDefaults::default_catalog( &state.config, @@ -1945,6 +1968,33 @@ impl FunctionRegistry for SessionState { } } +impl ExtensionTypeRegistry for SessionState { + fn extension_type( + &self, + name: &str, + ) -> datafusion_common::Result { + self.extension_types.extension_type(name) + } + + fn extension_types(&self) -> Vec> { + self.extension_types.extension_types() + } + + fn register_extension_type( + &self, + extension_type: ExtensionTypeRegistrationRef, + ) -> datafusion_common::Result> { + self.extension_types.register_extension_type(extension_type) + } + + fn deregister_extension_type( + &self, + name: &str, + ) -> datafusion_common::Result> { + self.extension_types.deregister_extension_type(name) + } +} + impl OptimizerConfig for SessionState { fn query_execution_start_time(&self) -> DateTime { self.execution_props.query_execution_start_time diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index 9a5739e56249..96c6cacf0fa2 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -25,7 +25,7 @@ use datafusion_common::types::{LogicalTypeRef, NativeType}; use datafusion_common::{not_impl_err, plan_datafusion_err, HashMap, Result}; use std::collections::HashSet; use std::fmt::Debug; -use std::sync::Arc; +use std::sync::{Arc, RwLock}; /// A registry knows how to build logical expressions out of user-defined function' names pub trait FunctionRegistry { @@ -224,7 +224,7 @@ impl FunctionRegistry for MemoryFunctionRegistry { /// represent the semantics of an extension type. One cannot directly register the [LogicalType] /// instances because some extension types may have parameters that are unknown at compile time /// (e.g., the unknown type in [Opaque](arrow_schema::extension::Opaque)). -pub trait ExtensionTypeRegistration: Debug { +pub trait ExtensionTypeRegistration: Debug + Send + Sync { /// The name of the extension type. /// /// This name will be used to find the correct [ExtensionTypeRegistration] when an extension @@ -239,16 +239,16 @@ pub trait ExtensionTypeRegistration: Debug { } /// A cheaply clonable pointer to an [ExtensionTypeRegistration]. -type ExtensionTypeRegistrationRef = Arc; +pub type ExtensionTypeRegistrationRef = Arc; /// Supports registering custom [LogicalType]s, including native types. -pub trait ExtensionTypeRegistry { +pub trait ExtensionTypeRegistry: Debug + Send + Sync { /// Returns a reference to the logical type named `name`. /// /// Returns an error if there is no fn extension_type(&self, name: &str) -> Result; - /// TODO + /// Creates a [LogicalTypeRef] from the type information in the `field`. fn create_logical_type_for_field(&self, field: Field) -> Result { match field.extension_type_name() { None => Ok(Arc::new(NativeType::from(field.data_type()))), @@ -259,13 +259,16 @@ pub trait ExtensionTypeRegistry { } } + /// Returns all registered [ExtensionTypeRegistration]. + fn extension_types(&self) -> Vec>; + /// Registers a new [ExtensionTypeRegistrationRef], returning any previously registered /// implementation. /// /// Returns an error if the type cannot be registered, for example, if the registry is /// read-only. fn register_extension_type( - &mut self, + &self, extension_type: ExtensionTypeRegistrationRef, ) -> Result>; @@ -275,7 +278,7 @@ pub trait ExtensionTypeRegistry { /// Returns an error if the type cannot be deregistered, for example, if the registry is /// read-only. fn deregister_extension_type( - &mut self, + &self, name: &str, ) -> Result>; } @@ -284,13 +287,13 @@ pub trait ExtensionTypeRegistry { #[derive(Clone, Debug)] pub struct MemoryExtensionTypeRegistry { /// Holds a mapping between the name of an extension type and its logical type. - extension_types: HashMap, + extension_types: Arc>>, } impl Default for MemoryExtensionTypeRegistry { fn default() -> Self { MemoryExtensionTypeRegistry { - extension_types: HashMap::new(), + extension_types: Arc::new(RwLock::new(HashMap::new())), } } } @@ -299,7 +302,7 @@ impl MemoryExtensionTypeRegistry { /// Creates an empty [MemoryExtensionTypeRegistry]. pub fn new() -> Self { Self { - extension_types: HashMap::new(), + extension_types: Arc::new(RwLock::new(HashMap::new())), } } @@ -314,45 +317,69 @@ impl MemoryExtensionTypeRegistry { let extension_types = types .into_iter() .map(|t| (t.type_name().to_owned(), t)) - .collect(); - Ok(Self { extension_types }) + .collect::>(); + Ok(Self { + extension_types: Arc::new(RwLock::new(extension_types)), + }) } /// Returns a list of all registered types. pub fn all_extension_types(&self) -> Vec { - self.extension_types.values().cloned().collect() + self.extension_types + .read() + .expect("Extension type registry lock poisoned") + .values() + .cloned() + .collect() } } impl ExtensionTypeRegistry for MemoryExtensionTypeRegistry { fn extension_type(&self, name: &str) -> Result { self.extension_types + .write() + .expect("Extension type registry lock poisoned") .get(name) .ok_or_else(|| plan_datafusion_err!("Logical type not found.")) .cloned() } + fn extension_types(&self) -> Vec> { + self.extension_types + .read() + .expect("Extension type registry lock poisoned") + .values() + .cloned() + .collect() + } + fn register_extension_type( - &mut self, + &self, extension_type: ExtensionTypeRegistrationRef, ) -> Result> { Ok(self .extension_types + .write() + .expect("Extension type registry lock poisoned") .insert(extension_type.type_name().to_owned(), extension_type)) } fn deregister_extension_type( - &mut self, + &self, name: &str, ) -> Result> { - Ok(self.extension_types.remove(name)) + Ok(self + .extension_types + .write() + .expect("Extension type registry lock poisoned") + .remove(name)) } } impl From> for MemoryExtensionTypeRegistry { fn from(value: HashMap) -> Self { Self { - extension_types: value, + extension_types: Arc::new(RwLock::new(value)), } } } From 9a9469a697b1d6b655796b20b4471166883653d2 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Sat, 8 Nov 2025 13:25:04 +0100 Subject: [PATCH 6/6] Type --- datafusion/expr/src/registry.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index 96c6cacf0fa2..284e01a639d8 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -238,7 +238,7 @@ pub trait ExtensionTypeRegistration: Debug + Send + Sync { fn create_logical_type(&self, field: Field) -> Result; } -/// A cheaply clonable pointer to an [ExtensionTypeRegistration]. +/// A cheaply cloneable pointer to an [ExtensionTypeRegistration]. pub type ExtensionTypeRegistrationRef = Arc; /// Supports registering custom [LogicalType]s, including native types.