Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ExtensionType for uuid and map to parquet logical type #5822

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion arrow-schema/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ bench = false
[dependencies]
serde = { version = "1.0", default-features = false, features = ["derive", "std", "rc"], optional = true }
bitflags = { version = "2.0.0", default-features = false, optional = true }
serde_json = "1.0"

[features]
# Enable ffi support
Expand All @@ -45,5 +46,4 @@ ffi = ["bitflags"]
features = ["ffi"]

[dev-dependencies]
serde_json = "1.0"
bincode = { version = "1.3.3", default-features = false }
293 changes: 293 additions & 0 deletions arrow-schema/src/datatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,299 @@ impl DataType {
}
}

/// The metadata key for the string name identifying the custom data type.
///
/// The value stored under this key is the extension type's name, e.g. the
/// `NAME` constant of an [`ExtensionType`] implementation.
pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name";

/// The metadata key for a serialized representation of the ExtensionType
/// necessary to reconstruct the custom type.
///
/// This entry is optional: extension types without metadata do not set it.
pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata";

/// Extension types.
///
/// An extension type is identified by its [`ExtensionType::NAME`] and may
/// carry metadata that is exchanged in serialized (string) form.
///
/// <https://arrow.apache.org/docs/format/Columnar.html#extension-types>
pub trait ExtensionType: Sized {
/// The name of this extension type.
const NAME: &'static str;

/// The supported storage types of this extension type.
fn storage_types(&self) -> &[DataType];

/// The metadata type of this extension type.
type Metadata;

/// Returns a reference to the metadata of this extension type, or `None`
/// if this extension type has no metadata.
fn metadata(&self) -> Option<&Self::Metadata>;

/// Returns the serialized representation of the metadata of this extension
/// type, or `None` if this extension type has no metadata.
fn serialized_metadata(&self) -> Option<String>;

/// Deserialize this extension type from the serialized representation of the
/// metadata of this extension. An extension type that has no metadata should
/// expect `None` for the serialized metadata.
///
/// Returns `None` if the given serialized metadata is not valid for this
/// extension type.
fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option<Self>;
}

/// Crate-internal helpers layered on top of [`ExtensionType`].
///
/// Every [`ExtensionType`] gets these methods through the blanket
/// implementation that follows the trait definition.
pub(crate) trait ExtensionTypeExt: ExtensionType {
    /// Returns `true` if `data_type` is one of the storage types of this
    /// extension type.
    fn supports(&self, data_type: &DataType) -> bool {
        self.storage_types()
            .iter()
            .any(|storage_type| storage_type == data_type)
    }

    /// Try to extract this extension type from the given [`Field`].
    ///
    /// The result is `None` when any of the following holds:
    /// - the field carries no extension type name
    /// - the stored name differs from [`ExtensionType::NAME`]
    /// - the serialized metadata cannot be deserialized
    /// - the field's data type is not a supported storage type
    fn try_from_field(field: &Field) -> Option<Self> {
        let field_metadata = field.metadata();

        // The name entry is mandatory and has to match exactly.
        let stored_name = field_metadata.get(EXTENSION_TYPE_NAME_KEY)?;
        if stored_name.as_str() != <Self as ExtensionType>::NAME {
            return None;
        }

        // The metadata entry is optional; the implementation decides
        // whether a missing entry is acceptable.
        let serialized = field_metadata
            .get(EXTENSION_TYPE_METADATA_KEY)
            .map(String::as_str);
        let candidate = Self::from_serialized_metadata(serialized)?;

        // Only hand out the extension type if it fits the field's storage.
        if candidate.supports(field.data_type()) {
            Some(candidate)
        } else {
            None
        }
    }
}

/// Blanket implementation: all extension types get the helper methods.
impl<T: ExtensionType> ExtensionTypeExt for T {}

/// Canonical extension types.
///
/// The Arrow columnar format allows defining extension types so as to extend
/// standard Arrow data types with custom semantics. Often these semantics will
/// be specific to a system or application. However, it is beneficial to share
/// the definitions of well-known extension types so as to improve
/// interoperability between different systems integrating Arrow columnar data.
pub mod canonical_extension_types {
use serde_json::Value;

use super::{DataType, ExtensionType};

/// The canonical extension types supported by this crate.
///
/// The enum is `#[non_exhaustive]`, so matches on it need a wildcard arm.
#[non_exhaustive]
#[derive(Debug, Clone, PartialEq)]
pub enum CanonicalExtensionTypes {
    /// The extension type for `JSON`.
    Json(Json),
    /// The extension type for `UUID`.
    Uuid(Uuid),
}

/// Wraps a [`Json`] extension type in [`CanonicalExtensionTypes::Json`].
impl From<Json> for CanonicalExtensionTypes {
    fn from(json: Json) -> Self {
        Self::Json(json)
    }
}

/// Wraps a [`Uuid`] extension type in [`CanonicalExtensionTypes::Uuid`].
impl From<Uuid> for CanonicalExtensionTypes {
    fn from(uuid: Uuid) -> Self {
        Self::Uuid(uuid)
    }
}

/// The extension type for `JSON`.
///
/// Extension name: `arrow.json`.
///
/// Storage types: `String`, `LargeString` or `StringView`. Only UTF-8
/// encoded JSON as specified in
/// [rfc8259](https://datatracker.ietf.org/doc/html/rfc8259) is supported.
///
/// This type does not have any parameters.
///
/// Metadata is either an empty string or a JSON string with an empty
/// object. In the future, additional fields may be added, but they are not
/// required to interpret the array.
///
/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#json>
#[derive(Debug, Clone, PartialEq)]
pub struct Json(Value);

/// The default `Json` extension type wraps an empty JSON string value.
impl Default for Json {
    fn default() -> Self {
        Json(Value::String(String::new()))
    }
}

impl ExtensionType for Json {
    const NAME: &'static str = "arrow.json";

    type Metadata = Value;

    /// JSON may be stored in any of the UTF-8 string types.
    fn storage_types(&self) -> &[DataType] {
        &[DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View]
    }

    /// Returns the wrapped JSON value; this extension type always carries
    /// metadata.
    fn metadata(&self) -> Option<&Self::Metadata> {
        Some(&self.0)
    }

    /// Returns the JSON text of the wrapped value.
    fn serialized_metadata(&self) -> Option<String> {
        Some(self.0.to_string())
    }

    /// Reconstructs the extension type from its serialized metadata.
    ///
    /// Accepted inputs:
    /// - a truly empty string (the "empty string" metadata allowed by the
    ///   canonical extension type definition),
    /// - the JSON literal `""`, which is what [`Json::default`] serializes
    ///   to,
    /// - any JSON text that parses to an empty object, e.g. `{}`.
    ///
    /// Returns `None` if the metadata is missing or is anything else.
    fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option<Self> {
        match serialized_metadata? {
            // Empty metadata, either literally empty or the serialized
            // form of an empty JSON string.
            "" | r#""""# => Some(Self::default()),
            // Otherwise only an empty JSON object is valid.
            other => other
                .parse::<Value>()
                .ok()
                .filter(|value| matches!(value.as_object(), Some(map) if map.is_empty()))
                .map(Self),
        }
    }
}

/// The extension type for `UUID`.
///
/// Extension name: `arrow.uuid`.
///
/// The storage type of the extension is `FixedSizeBinary` with a length of
/// 16 bytes.
///
/// Note:
/// A specific UUID version is not required or guaranteed. This extension
/// represents UUIDs as `FixedSizeBinary(16)` with big-endian notation and
/// does not interpret the bytes in any way.
///
/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#uuid>
#[derive(Debug, Default, Clone, Copy, PartialEq)]
pub struct Uuid;

impl ExtensionType for Uuid {
    const NAME: &'static str = "arrow.uuid";

    type Metadata = ();

    /// The only supported storage type is `FixedSizeBinary(16)`.
    fn storage_types(&self) -> &[DataType] {
        &[DataType::FixedSizeBinary(16)]
    }

    /// This extension type has no metadata.
    fn metadata(&self) -> Option<&Self::Metadata> {
        None
    }

    /// This extension type has no metadata, so nothing is serialized.
    fn serialized_metadata(&self) -> Option<String> {
        None
    }

    /// Succeeds only when no serialized metadata is present at all.
    fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option<Self> {
        match serialized_metadata {
            None => Some(Uuid),
            Some(_) => None,
        }
    }
}

// Unit tests for the canonical `Json` and `Uuid` extension types, exercised
// through the extension type methods on `Field`.
#[cfg(test)]
mod tests {
use std::collections::HashMap;

use serde_json::Map;

use crate::{ArrowError, Field, EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY};

use super::*;

// `Json` can be attached to, and read back from, fields of every supported
// storage type, with both the empty-string and the empty-object metadata.
#[test]
fn json() -> Result<(), ArrowError> {
let mut field = Field::new("", DataType::Utf8, false);
field.try_with_extension_type(Json::default())?;
// The default metadata serializes to the JSON literal `""`.
assert_eq!(
field.metadata().get(EXTENSION_TYPE_METADATA_KEY),
Some(&r#""""#.to_owned())
);
assert!(field.extension_type::<Json>().is_some());

let mut field = Field::new("", DataType::LargeUtf8, false);
field.try_with_extension_type(Json(serde_json::Value::Object(Map::default())))?;
// An empty object serializes to `{}`.
assert_eq!(
field.metadata().get(EXTENSION_TYPE_METADATA_KEY),
Some(&"{}".to_owned())
);
assert!(field.extension_type::<Json>().is_some());

let mut field = Field::new("", DataType::Utf8View, false);
field.try_with_extension_type(Json::default())?;
assert!(field.extension_type::<Json>().is_some());
assert_eq!(
field.canonical_extension_type(),
Some(CanonicalExtensionTypes::Json(Json::default()))
);
Ok(())
}

// Attaching `Json` to a field with an unsupported data type panics with a
// message listing the supported storage types.
#[test]
#[should_panic(expected = "expected Utf8 or LargeUtf8 or Utf8View, found Boolean")]
fn json_bad_type() {
Field::new("", DataType::Boolean, false).with_extension_type(Json::default());
}

// Metadata that is valid JSON but neither an empty string nor an empty
// object is rejected.
#[test]
fn json_bad_metadata() {
let field = Field::new("", DataType::Utf8, false).with_metadata(HashMap::from_iter([
(EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned()),
(EXTENSION_TYPE_METADATA_KEY.to_owned(), "1234".to_owned()),
]));
// This returns `None` now because this metadata is invalid.
assert!(field.extension_type::<Json>().is_none());
}

// A field that names the `Json` extension type but has no metadata entry
// is rejected.
#[test]
fn json_missing_metadata() {
let field = Field::new("", DataType::LargeUtf8, false).with_metadata(
HashMap::from_iter([(EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned())]),
);
// This returns `None` now because the metadata is missing.
assert!(field.extension_type::<Json>().is_none());
}

// `Uuid` can be attached to, and read back from, a `FixedSizeBinary(16)`
// field.
#[test]
fn uuid() -> Result<(), ArrowError> {
let mut field = Field::new("", DataType::FixedSizeBinary(16), false);
field.try_with_extension_type(Uuid)?;
assert!(field.extension_type::<Uuid>().is_some());
assert_eq!(
field.canonical_extension_type(),
Some(CanonicalExtensionTypes::Uuid(Uuid))
);
Ok(())
}

// Attaching `Uuid` to a fixed size binary field of the wrong width panics.
#[test]
#[should_panic(expected = "expected FixedSizeBinary(16), found FixedSizeBinary(8)")]
fn uuid_bad_type() {
Field::new("", DataType::FixedSizeBinary(8), false).with_extension_type(Uuid);
}

// A metadata entry that was already present on the field is kept when
// `Uuid` is attached, which makes the field unreadable as `Uuid`.
#[test]
fn uuid_with_metadata() {
// Add metadata that's not expected for uuid.
let field = Field::new("", DataType::FixedSizeBinary(16), false)
.with_metadata(HashMap::from_iter([(
EXTENSION_TYPE_METADATA_KEY.to_owned(),
"".to_owned(),
)]))
.with_extension_type(Uuid);
// This returns `None` now because `Uuid` expects no metadata.
assert!(field.extension_type::<Uuid>().is_none());
}
}
}

/// The maximum precision for [`DataType::Decimal128`] values
pub const DECIMAL128_MAX_PRECISION: u8 = 38;

Expand Down
63 changes: 62 additions & 1 deletion arrow-schema/src/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use crate::canonical_extension_types::{CanonicalExtensionTypes, Json, Uuid};
use crate::error::ArrowError;
use std::cmp::Ordering;
use std::collections::HashMap;
Expand All @@ -23,7 +24,10 @@ use std::sync::Arc;

use crate::datatype::DataType;
use crate::schema::SchemaBuilder;
use crate::{Fields, UnionFields, UnionMode};
use crate::{
ExtensionType, ExtensionTypeExt, Fields, UnionFields, UnionMode, EXTENSION_TYPE_METADATA_KEY,
EXTENSION_TYPE_NAME_KEY,
};

/// A reference counted [`Field`]
pub type FieldRef = Arc<Field>;
Expand Down Expand Up @@ -337,6 +341,63 @@ impl Field {
self
}

/// Returns the given [`ExtensionType`] of this [`Field`], if set.
/// Returns `None` if this field does not have this extension type.
pub fn extension_type<E: ExtensionType>(&self) -> Option<E> {
E::try_from_field(self)
}

/// Returns the [`CanonicalExtensionTypes`] of this [`Field`], if set.
pub fn canonical_extension_type(&self) -> Option<CanonicalExtensionTypes> {
Json::try_from_field(self)
.map(Into::into)
.or(Uuid::try_from_field(self).map(Into::into))
}

/// Attaches the given [`ExtensionType`] to this [`Field`] by writing its
/// [`ExtensionType::NAME`] and, if present, its serialized metadata into
/// the field's metadata.
///
/// NOTE(review): when the extension type has no serialized metadata, an
/// already present `ARROW:extension:metadata` entry is left untouched.
///
/// # Errors
///
/// Returns an [`ArrowError::InvalidArgumentError`] if the data type of this
/// field is not one of the storage types of the given extension type. The
/// field is not modified in that case.
pub fn try_with_extension_type<E: ExtensionType>(
    &mut self,
    extension_type: E,
) -> Result<(), ArrowError> {
    // Reject unsupported storage types before touching the metadata.
    if !extension_type.supports(&self.data_type) {
        let expected = extension_type
            .storage_types()
            .iter()
            .map(ToString::to_string)
            .collect::<Vec<_>>()
            .join(" or ");
        return Err(ArrowError::InvalidArgumentError(format!(
            "storage type of extension type {} does not match field data type, expected {}, found {}",
            <E as ExtensionType>::NAME,
            expected,
            self.data_type
        )));
    }

    // The name is always recorded.
    self.metadata
        .insert(EXTENSION_TYPE_NAME_KEY.to_owned(), E::NAME.to_owned());

    // The metadata entry is only written for types that have metadata.
    if let Some(serialized) = extension_type.serialized_metadata() {
        self.metadata
            .insert(EXTENSION_TYPE_METADATA_KEY.to_owned(), serialized);
    }

    Ok(())
}

/// Builder-style variant of [`Field::try_with_extension_type`]: consumes
/// this [`Field`], attaches the given [`ExtensionType`] and returns the
/// updated field.
///
/// # Panics
///
/// This function panics if the data type of this field does not match any
/// storage type of the given extension type.
pub fn with_extension_type<E: ExtensionType>(mut self, extension_type: E) -> Self {
    if let Err(e) = self.try_with_extension_type(extension_type) {
        panic!("{e}");
    }
    self
}

/// Indicates whether this [`Field`] supports null values.
#[inline]
pub const fn is_nullable(&self) -> bool {
Expand Down
Loading
Loading