diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml
index 628d4a683cac..711543a18677 100644
--- a/arrow-schema/Cargo.toml
+++ b/arrow-schema/Cargo.toml
@@ -36,6 +36,7 @@ bench = false
 [dependencies]
 serde = { version = "1.0", default-features = false, features = ["derive", "std", "rc"], optional = true }
 bitflags = { version = "2.0.0", default-features = false, optional = true }
+serde_json = "1.0"
 
 [features]
 # Enable ffi support
@@ -45,5 +46,4 @@ ffi = ["bitflags"]
 features = ["ffi"]
 
 [dev-dependencies]
-serde_json = "1.0"
 bincode = { version = "1.3.3", default-features = false }
diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs
index ff5832dfa68c..b7a326f605f3 100644
--- a/arrow-schema/src/datatype.rs
+++ b/arrow-schema/src/datatype.rs
@@ -764,6 +764,299 @@ impl DataType {
     }
 }
 
+/// The metadata key for the string name identifying the custom data type.
+pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name";
+
+/// The metadata key for a serialized representation of the ExtensionType
+/// necessary to reconstruct the custom type.
+pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata";
+
+/// Extension types.
+///
+///
+pub trait ExtensionType: Sized {
+    /// The name of this extension type.
+    const NAME: &'static str;
+
+    /// The supported storage types of this extension type.
+    fn storage_types(&self) -> &[DataType];
+
+    /// The metadata type of this extension type.
+    type Metadata;
+
+    /// Returns a reference to the metadata of this extension type, or `None`
+    /// if this extension type has no metadata.
+    fn metadata(&self) -> Option<&Self::Metadata>;
+
+    /// Returns the serialized representation of the metadata of this extension
+    /// type, or `None` if this extension type has no metadata.
+    fn serialized_metadata(&self) -> Option<String>;
+
+    /// Deserialize this extension type from the serialized representation of the
+    /// metadata of this extension. An extension type that has no metadata should
+    /// expect `None` for the serialized metadata.
+    fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option<Self>;
+}
+
+pub(crate) trait ExtensionTypeExt: ExtensionType {
+    /// Returns `true` if the given data type is supported by this extension
+    /// type.
+    fn supports(&self, data_type: &DataType) -> bool {
+        self.storage_types().contains(data_type)
+    }
+
+    /// Try to extract this extension type from the given [`Field`].
+    ///
+    /// This function returns `None` if extension type
+    /// - information is missing
+    /// - name does not match
+    /// - metadata deserialization failed
+    /// - does not support the data type of this field
+    fn try_from_field(field: &Field) -> Option<Self> {
+        field
+            .metadata()
+            .get(EXTENSION_TYPE_NAME_KEY)
+            .and_then(|name| {
+                (name == <Self as ExtensionType>::NAME)
+                    .then(|| {
+                        Self::from_serialized_metadata(
+                            field
+                                .metadata()
+                                .get(EXTENSION_TYPE_METADATA_KEY)
+                                .map(String::as_str),
+                        )
+                    })
+                    .flatten()
+            })
+            .filter(|extension_type| extension_type.supports(field.data_type()))
+    }
+}
+
+impl<T> ExtensionTypeExt for T where T: ExtensionType {}
+
+/// Canonical extension types.
+///
+/// The Arrow columnar format allows defining extension types so as to extend
+/// standard Arrow data types with custom semantics. Often these semantics will
+/// be specific to a system or application. However, it is beneficial to share
+/// the definitions of well-known extension types so as to improve
+/// interoperability between different systems integrating Arrow columnar data.
+pub mod canonical_extension_types {
+    use serde_json::Value;
+
+    use super::{DataType, ExtensionType};
+
+    /// Canonical extension types.
+    #[non_exhaustive]
+    #[derive(Debug, Clone, PartialEq)]
+    pub enum CanonicalExtensionTypes {
+        /// The extension type for `JSON`.
+        Json(Json),
+        /// The extension type for `UUID`.
+        Uuid(Uuid),
+    }
+
+    impl From<Json> for CanonicalExtensionTypes {
+        fn from(value: Json) -> Self {
+            CanonicalExtensionTypes::Json(value)
+        }
+    }
+
+    impl From<Uuid> for CanonicalExtensionTypes {
+        fn from(value: Uuid) -> Self {
+            CanonicalExtensionTypes::Uuid(value)
+        }
+    }
+
+    /// The extension type for `JSON`.
+    ///
+    /// Extension name: `arrow.json`.
+    ///
+    /// The storage type of this extension is `String` or `LargeString` or
+    /// `StringView`. Only UTF-8 encoded JSON as specified in [rfc8259](https://datatracker.ietf.org/doc/html/rfc8259)
+    /// is supported.
+    ///
+    /// This type does not have any parameters.
+    ///
+    /// Metadata is either an empty string or a JSON string with an empty
+    /// object. In the future, additional fields may be added, but they are not
+    /// required to interpret the array.
+    ///
+    ///
+    #[derive(Debug, Clone, PartialEq)]
+    pub struct Json(Value);
+
+    impl Default for Json {
+        fn default() -> Self {
+            Self(Value::String("".to_owned()))
+        }
+    }
+
+    impl ExtensionType for Json {
+        const NAME: &'static str = "arrow.json";
+
+        type Metadata = Value;
+
+        fn storage_types(&self) -> &[DataType] {
+            &[DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View]
+        }
+
+        fn metadata(&self) -> Option<&Self::Metadata> {
+            Some(&self.0)
+        }
+
+        fn serialized_metadata(&self) -> Option<String> {
+            Some(self.0.to_string())
+        }
+
+        fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option<Self> {
+            serialized_metadata.and_then(|metadata| match metadata {
+                // Empty string
+                r#""""# => Some(Default::default()),
+                // Empty object
+                value => value
+                    .parse::<Value>()
+                    .ok()
+                    .filter(|value| matches!(value.as_object(), Some(map) if map.is_empty()))
+                    .map(Self),
+            })
+        }
+    }
+
+    /// The extension type for `UUID`.
+    ///
+    /// Extension name: `arrow.uuid`.
+    ///
+    /// The storage type of the extension is `FixedSizeBinary` with a length of
+    /// 16 bytes.
+    ///
+    /// Note:
+    /// A specific UUID version is not required or guaranteed. This extension
+    /// represents UUIDs as `FixedSizeBinary(16)` with big-endian notation and
+    /// does not interpret the bytes in any way.
+    ///
+    ///
+    #[derive(Debug, Default, Clone, Copy, PartialEq)]
+    pub struct Uuid;
+
+    impl ExtensionType for Uuid {
+        const NAME: &'static str = "arrow.uuid";
+
+        type Metadata = ();
+
+        fn storage_types(&self) -> &[DataType] {
+            &[DataType::FixedSizeBinary(16)]
+        }
+
+        fn metadata(&self) -> Option<&Self::Metadata> {
+            None
+        }
+
+        fn serialized_metadata(&self) -> Option<String> {
+            None
+        }
+
+        fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option<Self> {
+            serialized_metadata.is_none().then_some(Self)
+        }
+    }
+
+    #[cfg(test)]
+    mod tests {
+        use std::collections::HashMap;
+
+        use serde_json::Map;
+
+        use crate::{ArrowError, Field, EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY};
+
+        use super::*;
+
+        #[test]
+        fn json() -> Result<(), ArrowError> {
+            let mut field = Field::new("", DataType::Utf8, false);
+            field.try_with_extension_type(Json::default())?;
+            assert_eq!(
+                field.metadata().get(EXTENSION_TYPE_METADATA_KEY),
+                Some(&r#""""#.to_owned())
+            );
+            assert!(field.extension_type::<Json>().is_some());
+
+            let mut field = Field::new("", DataType::LargeUtf8, false);
+            field.try_with_extension_type(Json(serde_json::Value::Object(Map::default())))?;
+            assert_eq!(
+                field.metadata().get(EXTENSION_TYPE_METADATA_KEY),
+                Some(&"{}".to_owned())
+            );
+            assert!(field.extension_type::<Json>().is_some());
+
+            let mut field = Field::new("", DataType::Utf8View, false);
+            field.try_with_extension_type(Json::default())?;
+            assert!(field.extension_type::<Json>().is_some());
+            assert_eq!(
+                field.canonical_extension_type(),
+                Some(CanonicalExtensionTypes::Json(Json::default()))
+            );
+            Ok(())
+        }
+
+        #[test]
+        #[should_panic(expected = "expected Utf8 or LargeUtf8 or Utf8View, found Boolean")]
+        fn json_bad_type() {
+            Field::new("", DataType::Boolean, false).with_extension_type(Json::default());
+        }
+
+        #[test]
+        fn json_bad_metadata() {
+            let field = Field::new("", DataType::Utf8, false).with_metadata(HashMap::from_iter([
+                (EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned()),
+                (EXTENSION_TYPE_METADATA_KEY.to_owned(), "1234".to_owned()),
+            ]));
+            // This returns `None` now because this metadata is invalid.
+            assert!(field.extension_type::<Json>().is_none());
+        }
+
+        #[test]
+        fn json_missing_metadata() {
+            let field = Field::new("", DataType::LargeUtf8, false).with_metadata(
+                HashMap::from_iter([(EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned())]),
+            );
+            // This returns `None` now because the metadata is missing.
+            assert!(field.extension_type::<Json>().is_none());
+        }
+
+        #[test]
+        fn uuid() -> Result<(), ArrowError> {
+            let mut field = Field::new("", DataType::FixedSizeBinary(16), false);
+            field.try_with_extension_type(Uuid)?;
+            assert!(field.extension_type::<Uuid>().is_some());
+            assert_eq!(
+                field.canonical_extension_type(),
+                Some(CanonicalExtensionTypes::Uuid(Uuid))
+            );
+            Ok(())
+        }
+
+        #[test]
+        #[should_panic(expected = "expected FixedSizeBinary(16), found FixedSizeBinary(8)")]
+        fn uuid_bad_type() {
+            Field::new("", DataType::FixedSizeBinary(8), false).with_extension_type(Uuid);
+        }
+
+        #[test]
+        fn uuid_with_metadata() {
+            // Add metadata that's not expected for uuid.
+            let field = Field::new("", DataType::FixedSizeBinary(16), false)
+                .with_metadata(HashMap::from_iter([(
+                    EXTENSION_TYPE_METADATA_KEY.to_owned(),
+                    "".to_owned(),
+                )]))
+                .with_extension_type(Uuid);
+            // This returns `None` now because `Uuid` expects no metadata.
+            assert!(field.extension_type::<Uuid>().is_none());
+        }
+    }
+}
+
 /// The maximum precision for [DataType::Decimal128] values
 pub const DECIMAL128_MAX_PRECISION: u8 = 38;
 
diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs
index fc4852a3d37d..f16e2f9bbc05 100644
--- a/arrow-schema/src/field.rs
+++ b/arrow-schema/src/field.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use crate::canonical_extension_types::{CanonicalExtensionTypes, Json, Uuid};
 use crate::error::ArrowError;
 use std::cmp::Ordering;
 use std::collections::HashMap;
@@ -23,7 +24,10 @@ use std::sync::Arc;
 
 use crate::datatype::DataType;
 use crate::schema::SchemaBuilder;
-use crate::{Fields, UnionFields, UnionMode};
+use crate::{
+    ExtensionType, ExtensionTypeExt, Fields, UnionFields, UnionMode, EXTENSION_TYPE_METADATA_KEY,
+    EXTENSION_TYPE_NAME_KEY,
+};
 
 /// A reference counted [`Field`]
 pub type FieldRef = Arc<Field>;
@@ -337,6 +341,63 @@ impl Field {
         self
     }
 
+    /// Returns the given [`ExtensionType`] of this [`Field`], if set.
+    /// Returns `None` if this field does not have this extension type.
+    pub fn extension_type<E: ExtensionType>(&self) -> Option<E> {
+        E::try_from_field(self)
+    }
+
+    /// Returns the [`CanonicalExtensionTypes`] of this [`Field`], if set.
+    pub fn canonical_extension_type(&self) -> Option<CanonicalExtensionTypes> {
+        Json::try_from_field(self)
+            .map(Into::into)
+            .or_else(|| Uuid::try_from_field(self).map(Into::into))
+    }
+
+    /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`]
+    /// and [`ExtensionType::metadata`] of the given [`ExtensionType`].
+    ///
+    /// # Errors
+    ///
+    /// This function returns an error if the data type of this field does not
+    /// match any storage type of the given extension type.
+    pub fn try_with_extension_type<E: ExtensionType>(
+        &mut self,
+        extension_type: E,
+    ) -> Result<(), ArrowError> {
+        if extension_type.supports(&self.data_type) {
+            // Insert the name
+            self.metadata
+                .insert(EXTENSION_TYPE_NAME_KEY.to_owned(), E::NAME.to_owned());
+            // Insert the metadata, if any
+            if let Some(metadata) = extension_type.serialized_metadata() {
+                self.metadata
+                    .insert(EXTENSION_TYPE_METADATA_KEY.to_owned(), metadata);
+            }
+            Ok(())
+        } else {
+            Err(ArrowError::InvalidArgumentError(format!(
+                "storage type of extension type {} does not match field data type, expected {}, found {}",
+                <E as ExtensionType>::NAME,
+                extension_type.storage_types().iter().map(ToString::to_string).collect::<Vec<_>>().join(" or "),
+                self.data_type
+            )))
+        }
+    }
+
+    /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`]
+    /// and [`ExtensionType::metadata`] of the given [`ExtensionType`].
+    ///
+    /// # Panics
+    ///
+    /// This function panics if the data type of this field does not match any
+    /// storage type of the given extension type.
+    pub fn with_extension_type<E: ExtensionType>(mut self, extension_type: E) -> Self {
+        self.try_with_extension_type(extension_type)
+            .unwrap_or_else(|e| panic!("{e}"));
+        self
+    }
+
     /// Indicates whether this [`Field`] supports null values.
     #[inline]
     pub const fn is_nullable(&self) -> bool {
diff --git a/arrow-select/src/dictionary.rs b/arrow-select/src/dictionary.rs
index 2a532600b6cc..c363b99920a7 100644
--- a/arrow-select/src/dictionary.rs
+++ b/arrow-select/src/dictionary.rs
@@ -315,7 +315,7 @@ mod tests {
         assert_eq!(merged.values.as_ref(), &expected);
         assert_eq!(merged.key_mappings.len(), 2);
         assert_eq!(&merged.key_mappings[0], &[0, 0, 0, 1, 0]);
-        assert_eq!(&merged.key_mappings[1], &[]);
+        assert_eq!(&merged.key_mappings[1], &[] as &[i32; 0]);
     }
 
     #[test]
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 3ed3bd24e0a8..8a15037825d0 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -23,6 +23,7 @@
 //!
 //! The interfaces for converting arrow schema to parquet schema is coming.
 
+use arrow_schema::canonical_extension_types::Uuid;
 use base64::prelude::BASE64_STANDARD;
 use base64::Engine;
 use std::collections::HashMap;
@@ -471,6 +472,8 @@ fn arrow_to_parquet_type(field: &Field) -> Result<Type> {
                 .with_repetition(repetition)
                 .with_id(id)
                 .with_length(*length)
+                // If set, map arrow uuid extension type to parquet uuid logical type.
+                .with_logical_type(field.extension_type::<Uuid>().map(|_| LogicalType::Uuid))
                 .build()
         }
         DataType::BinaryView => Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY)
@@ -1932,4 +1935,23 @@ mod tests {
     fn test_get_arrow_schema_from_metadata() {
         assert!(get_arrow_schema_from_metadata("").is_err());
     }
+
+    #[test]
+    fn arrow_uuid_to_parquet_uuid() -> Result<()> {
+        let arrow_schema = Schema::new(vec![Field::new(
+            "uuid",
+            DataType::FixedSizeBinary(16),
+            false,
+        )
+        .with_extension_type(Uuid)]);
+
+        let parquet_schema = arrow_to_parquet_schema(&arrow_schema)?;
+
+        assert_eq!(
+            parquet_schema.column(0).logical_type(),
+            Some(LogicalType::Uuid)
+        );
+
+        Ok(())
+    }
 }