From 79c4cf91b5ff06e05b71b2194d3cd92f9873b051 Mon Sep 17 00:00:00 2001 From: Clide Stefani <109172241+Monkwire3@users.noreply.github.com> Date: Mon, 25 Mar 2024 23:42:35 -0400 Subject: [PATCH] Split cast::dictionary into a submodule of cast --- arrow-cast/src/cast/dictionary.rs | 196 ++++++++++++++++++++++++++++++ arrow-cast/src/cast/mod.rs | 180 +-------------------------- 2 files changed, 198 insertions(+), 178 deletions(-) create mode 100644 arrow-cast/src/cast/dictionary.rs diff --git a/arrow-cast/src/cast/dictionary.rs b/arrow-cast/src/cast/dictionary.rs new file mode 100644 index 000000000000..244e101f1d8d --- /dev/null +++ b/arrow-cast/src/cast/dictionary.rs @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::cast::*; + +/// Attempts to cast an `ArrayDictionary` with index type K into +/// `to_type` for supported types. +/// +/// K is the key type +pub(crate) fn dictionary_cast( + array: &dyn Array, + to_type: &DataType, + cast_options: &CastOptions, +) -> Result { + use DataType::*; + + match to_type { + Dictionary(to_index_type, to_value_type) => { + let dict_array = array + .as_any() + .downcast_ref::>() + .ok_or_else(|| { + ArrowError::ComputeError( + "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(), + ) + })?; + + let keys_array: ArrayRef = + Arc::new(PrimitiveArray::::from(dict_array.keys().to_data())); + let values_array = dict_array.values(); + let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?; + let cast_values = cast_with_options(values_array, to_value_type, cast_options)?; + + // Failure to cast keys (because they don't fit in the + // target type) results in NULL values; + if cast_keys.null_count() > keys_array.null_count() { + return Err(ArrowError::ComputeError(format!( + "Could not convert {} dictionary indexes from {:?} to {:?}", + cast_keys.null_count() - keys_array.null_count(), + keys_array.data_type(), + to_index_type + ))); + } + + let data = cast_keys.into_data(); + let builder = data + .into_builder() + .data_type(to_type.clone()) + .child_data(vec![cast_values.into_data()]); + + // Safety + // Cast keys are still valid + let data = unsafe { builder.build_unchecked() }; + + // create the appropriate array type + let new_array: ArrayRef = match **to_index_type { + Int8 => Arc::new(DictionaryArray::::from(data)), + Int16 => Arc::new(DictionaryArray::::from(data)), + Int32 => Arc::new(DictionaryArray::::from(data)), + Int64 => Arc::new(DictionaryArray::::from(data)), + UInt8 => Arc::new(DictionaryArray::::from(data)), + UInt16 => Arc::new(DictionaryArray::::from(data)), + UInt32 => Arc::new(DictionaryArray::::from(data)), + UInt64 => Arc::new(DictionaryArray::::from(data)), + _ => { + return Err(ArrowError::CastError(format!( + "Unsupported type {to_index_type:?} for dictionary index" + ))); + } + }; + + Ok(new_array) + } + _ => unpack_dictionary::(array, to_type, cast_options), + } +} + +// Unpack a dictionary where the keys are of type into a flattened array of type to_type +pub(crate) fn unpack_dictionary( + array: &dyn Array, + to_type: &DataType, + cast_options: &CastOptions, +) -> Result +where + K: ArrowDictionaryKeyType, +{ + let dict_array = array.as_dictionary::(); + let cast_dict_values = cast_with_options(dict_array.values(), to_type, cast_options)?; + take(cast_dict_values.as_ref(), dict_array.keys(), None) +} + +/// Attempts to encode an array into an `ArrayDictionary` with index +/// type K and value (dictionary) type value_type +/// +/// K is the key type +pub(crate) fn cast_to_dictionary( + array: &dyn Array, + dict_value_type: &DataType, + cast_options: &CastOptions, +) -> Result { + use DataType::*; + + match *dict_value_type { + Int8 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Int16 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Int32 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Int64 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + UInt8 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + UInt16 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + UInt32 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + UInt64 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Decimal128(_, _) => { + pack_numeric_to_dictionary::(array, dict_value_type, cast_options) + } + Decimal256(_, _) => { + pack_numeric_to_dictionary::(array, dict_value_type, cast_options) + } + Utf8 => pack_byte_to_dictionary::>(array, cast_options), + LargeUtf8 => pack_byte_to_dictionary::>(array, cast_options), + Binary => pack_byte_to_dictionary::>(array, cast_options), + LargeBinary => pack_byte_to_dictionary::>(array, cast_options), + _ => Err(ArrowError::CastError(format!( + "Unsupported output type for dictionary packing: {dict_value_type:?}" + ))), + } +} + +// Packs the data from the primitive array of type to a +// DictionaryArray with keys of type K and values of value_type V +pub(crate) fn pack_numeric_to_dictionary( + array: &dyn Array, + dict_value_type: &DataType, + cast_options: &CastOptions, +) -> Result +where + K: ArrowDictionaryKeyType, + V: ArrowPrimitiveType, +{ + // attempt to cast the source array values to the target value type (the dictionary values type) + let cast_values = cast_with_options(array, dict_value_type, cast_options)?; + let values = cast_values.as_primitive::(); + + let mut b = PrimitiveDictionaryBuilder::::with_capacity(values.len(), values.len()); + + // copy each element one at a time + for i in 0..values.len() { + if values.is_null(i) { + b.append_null(); + } else { + b.append(values.value(i))?; + } + } + Ok(Arc::new(b.finish())) +} + +// Packs the data as a GenericByteDictionaryBuilder, if possible, with the +// key types of K +pub(crate) fn pack_byte_to_dictionary( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result +where + K: ArrowDictionaryKeyType, + T: ByteArrayType, +{ + let cast_values = cast_with_options(array, &T::DATA_TYPE, cast_options)?; + let values = cast_values + .as_any() + .downcast_ref::>() + .unwrap(); + let mut b = GenericByteDictionaryBuilder::::with_capacity(values.len(), 1024, 1024); + + // copy each element one at a time + for i in 0..values.len() { + if values.is_null(i) { + b.append_null(); + } else { + b.append(values.value(i))?; + } + } + Ok(Arc::new(b.finish())) +} diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 61bbf1280030..52eb0d367271 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -38,8 +38,10 @@ //! ``` mod decimal; +mod dictionary; mod list; use crate::cast::decimal::*; +use crate::cast::dictionary::*; use crate::cast::list::*; use chrono::{NaiveTime, Offset, TimeZone, Utc}; @@ -2323,184 +2325,6 @@ where unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } } -/// Attempts to cast an `ArrayDictionary` with index type K into -/// `to_type` for supported types. -/// -/// K is the key type -fn dictionary_cast( - array: &dyn Array, - to_type: &DataType, - cast_options: &CastOptions, -) -> Result { - use DataType::*; - - match to_type { - Dictionary(to_index_type, to_value_type) => { - let dict_array = array - .as_any() - .downcast_ref::>() - .ok_or_else(|| { - ArrowError::ComputeError( - "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(), - ) - })?; - - let keys_array: ArrayRef = - Arc::new(PrimitiveArray::::from(dict_array.keys().to_data())); - let values_array = dict_array.values(); - let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?; - let cast_values = cast_with_options(values_array, to_value_type, cast_options)?; - - // Failure to cast keys (because they don't fit in the - // target type) results in NULL values; - if cast_keys.null_count() > keys_array.null_count() { - return Err(ArrowError::ComputeError(format!( - "Could not convert {} dictionary indexes from {:?} to {:?}", - cast_keys.null_count() - keys_array.null_count(), - keys_array.data_type(), - to_index_type - ))); - } - - let data = cast_keys.into_data(); - let builder = data - .into_builder() - .data_type(to_type.clone()) - .child_data(vec![cast_values.into_data()]); - - // Safety - // Cast keys are still valid - let data = unsafe { builder.build_unchecked() }; - - // create the appropriate array type - let new_array: ArrayRef = match **to_index_type { - Int8 => Arc::new(DictionaryArray::::from(data)), - Int16 => Arc::new(DictionaryArray::::from(data)), - Int32 => Arc::new(DictionaryArray::::from(data)), - Int64 => Arc::new(DictionaryArray::::from(data)), - UInt8 => Arc::new(DictionaryArray::::from(data)), - UInt16 => Arc::new(DictionaryArray::::from(data)), - UInt32 => Arc::new(DictionaryArray::::from(data)), - UInt64 => Arc::new(DictionaryArray::::from(data)), - _ => { - return Err(ArrowError::CastError(format!( - "Unsupported type {to_index_type:?} for dictionary index" - ))); - } - }; - - Ok(new_array) - } - _ => unpack_dictionary::(array, to_type, cast_options), - } -} - -// Unpack a dictionary where the keys are of type into a flattened array of type to_type -fn unpack_dictionary( - array: &dyn Array, - to_type: &DataType, - cast_options: &CastOptions, -) -> Result -where - K: ArrowDictionaryKeyType, -{ - let dict_array = array.as_dictionary::(); - let cast_dict_values = cast_with_options(dict_array.values(), to_type, cast_options)?; - take(cast_dict_values.as_ref(), dict_array.keys(), None) -} - -/// Attempts to encode an array into an `ArrayDictionary` with index -/// type K and value (dictionary) type value_type -/// -/// K is the key type -fn cast_to_dictionary( - array: &dyn Array, - dict_value_type: &DataType, - cast_options: &CastOptions, -) -> Result { - use DataType::*; - - match *dict_value_type { - Int8 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), - Int16 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), - Int32 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), - Int64 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), - UInt8 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), - UInt16 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), - UInt32 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), - UInt64 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), - Decimal128(_, _) => { - pack_numeric_to_dictionary::(array, dict_value_type, cast_options) - } - Decimal256(_, _) => { - pack_numeric_to_dictionary::(array, dict_value_type, cast_options) - } - Utf8 => pack_byte_to_dictionary::>(array, cast_options), - LargeUtf8 => pack_byte_to_dictionary::>(array, cast_options), - Binary => pack_byte_to_dictionary::>(array, cast_options), - LargeBinary => pack_byte_to_dictionary::>(array, cast_options), - _ => Err(ArrowError::CastError(format!( - "Unsupported output type for dictionary packing: {dict_value_type:?}" - ))), - } -} - -// Packs the data from the primitive array of type to a -// DictionaryArray with keys of type K and values of value_type V -fn pack_numeric_to_dictionary( - array: &dyn Array, - dict_value_type: &DataType, - cast_options: &CastOptions, -) -> Result -where - K: ArrowDictionaryKeyType, - V: ArrowPrimitiveType, -{ - // attempt to cast the source array values to the target value type (the dictionary values type) - let cast_values = cast_with_options(array, dict_value_type, cast_options)?; - let values = cast_values.as_primitive::(); - - let mut b = PrimitiveDictionaryBuilder::::with_capacity(values.len(), values.len()); - - // copy each element one at a time - for i in 0..values.len() { - if values.is_null(i) { - b.append_null(); - } else { - b.append(values.value(i))?; - } - } - Ok(Arc::new(b.finish())) -} - -// Packs the data as a GenericByteDictionaryBuilder, if possible, with the -// key types of K -fn pack_byte_to_dictionary( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result -where - K: ArrowDictionaryKeyType, - T: ByteArrayType, -{ - let cast_values = cast_with_options(array, &T::DATA_TYPE, cast_options)?; - let values = cast_values - .as_any() - .downcast_ref::>() - .unwrap(); - let mut b = GenericByteDictionaryBuilder::::with_capacity(values.len(), 1024, 1024); - - // copy each element one at a time - for i in 0..values.len() { - if values.is_null(i) { - b.append_null(); - } else { - b.append(values.value(i))?; - } - } - Ok(Arc::new(b.finish())) -} - /// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same /// offset size so re-encoding offset is unnecessary. fn cast_binary_to_string(