Skip to content

Commit

Permalink
arrow-array::builder: support more dictionary keys
Browse files Browse the repository at this point in the history
The spec says that the keys in dictionaries are [0]:

> (4) The index type of a Dictionary type can only be an integer type, preferably signed, with width 8 to 64 bits.

In my use case I have a very small number of values so wasting bits on a
wider key is wasteful.

[0]: https://github.com/apache/arrow/blob/fe32a7dfe5e22e7737198476fe1ac0e8a5dccef2/docs/source/format/Columnar.rst?plain=1#L182-L183
  • Loading branch information
ajwerner committed Dec 6, 2024
1 parent e7588bd commit 91f5725
Showing 1 changed file with 76 additions and 33 deletions.
109 changes: 76 additions & 33 deletions arrow-array/src/builder/struct_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
// specific language governing permissions and limitations
// under the License.

use crate::builder::*;
use crate::types::Int32Type;
use crate::StructArray;
use crate::{
builder::*,
types::{Int16Type, Int32Type, Int64Type, Int8Type},
};
use arrow_buffer::NullBufferBuilder;
use arrow_schema::{DataType, Fields, IntervalUnit, SchemaBuilder, TimeUnit};
use std::sync::Arc;
Expand Down Expand Up @@ -291,29 +293,42 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box<dyn ArrayBuilde
t => panic!("The field of Map data type {t:?} should has a child Struct field"),
},
DataType::Struct(fields) => Box::new(StructBuilder::from_fields(fields.clone(), capacity)),
DataType::Dictionary(key_type, value_type) if **key_type == DataType::Int32 => {
match &**value_type {
DataType::Utf8 => {
let dict_builder: StringDictionaryBuilder<Int32Type> =
StringDictionaryBuilder::with_capacity(capacity, 256, 1024);
Box::new(dict_builder)
}
DataType::LargeUtf8 => {
let dict_builder: LargeStringDictionaryBuilder<Int32Type> =
LargeStringDictionaryBuilder::with_capacity(capacity, 256, 1024);
Box::new(dict_builder)
}
DataType::Binary => {
let dict_builder: BinaryDictionaryBuilder<Int32Type> =
BinaryDictionaryBuilder::with_capacity(capacity, 256, 1024);
Box::new(dict_builder)
}
DataType::LargeBinary => {
let dict_builder: LargeBinaryDictionaryBuilder<Int32Type> =
LargeBinaryDictionaryBuilder::with_capacity(capacity, 256, 1024);
Box::new(dict_builder)
t @ DataType::Dictionary(key_type, value_type) => {
macro_rules! dict_builder {
($key_type:ty) => {
match &**value_type {
DataType::Utf8 => {
let dict_builder: StringDictionaryBuilder<$key_type> =
StringDictionaryBuilder::with_capacity(capacity, 256, 1024);
Box::new(dict_builder)
}
DataType::LargeUtf8 => {
let dict_builder: LargeStringDictionaryBuilder<$key_type> =
LargeStringDictionaryBuilder::with_capacity(capacity, 256, 1024);
Box::new(dict_builder)
}
DataType::Binary => {
let dict_builder: BinaryDictionaryBuilder<$key_type> =
BinaryDictionaryBuilder::with_capacity(capacity, 256, 1024);
Box::new(dict_builder)
}
DataType::LargeBinary => {
let dict_builder: LargeBinaryDictionaryBuilder<$key_type> =
LargeBinaryDictionaryBuilder::with_capacity(capacity, 256, 1024);
Box::new(dict_builder)
}
t => panic!("Dictionary value type {t:?} is not currently supported"),
}
};
}
match &**key_type {
DataType::Int8 => dict_builder!(Int8Type),
DataType::Int16 => dict_builder!(Int16Type),
DataType::Int32 => dict_builder!(Int32Type),
DataType::Int64 => dict_builder!(Int64Type),
_ => {
panic!("Data type {t:?} with key type {key_type:?} is not currently supported")
}
t => panic!("Unsupported dictionary value type {t:?} is not currently supported"),
}
}
t => panic!("Data type {t:?} is not currently supported"),
Expand Down Expand Up @@ -431,12 +446,14 @@ impl StructBuilder {

#[cfg(test)]
mod tests {
use std::any::type_name;

use super::*;
use arrow_buffer::Buffer;
use arrow_data::ArrayData;
use arrow_schema::Field;

use crate::array::Array;
use crate::{array::Array, types::ArrowDictionaryKeyType};

#[test]
fn test_struct_array_builder() {
Expand Down Expand Up @@ -691,10 +708,31 @@ mod tests {
}

#[test]
fn test_struct_array_builder_from_dictionary_type() {
fn test_struct_array_builder_from_dictionary_type_int8_key() {
test_struct_array_builder_from_dictionary_type_inner::<Int8Type>(DataType::Int8);
}

#[test]
fn test_struct_array_builder_from_dictionary_type_int16_key() {
test_struct_array_builder_from_dictionary_type_inner::<Int16Type>(DataType::Int16);
}

#[test]
fn test_struct_array_builder_from_dictionary_type_int32_key() {
test_struct_array_builder_from_dictionary_type_inner::<Int32Type>(DataType::Int32);
}

#[test]
fn test_struct_array_builder_from_dictionary_type_int64_key() {
test_struct_array_builder_from_dictionary_type_inner::<Int64Type>(DataType::Int64);
}

fn test_struct_array_builder_from_dictionary_type_inner<K: ArrowDictionaryKeyType>(
key_type: DataType,
) {
let dict_field = Field::new(
"f1",
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
DataType::Dictionary(Box::new(key_type), Box::new(DataType::Utf8)),
false,
);
let fields = vec![dict_field.clone()];
Expand All @@ -703,8 +741,11 @@ mod tests {
let expected_child_dtype = dict_field.data_type();
let mut struct_builder = StructBuilder::from_fields(vec![cloned_dict_field], 5);
struct_builder
.field_builder::<StringDictionaryBuilder<Int32Type>>(0)
.expect("Builder should be StringDictionaryBuilder")
.field_builder::<StringDictionaryBuilder<K>>(0)
.expect(&format!(
"Builder should be StringDictionaryBuilder<{}>",
type_name::<K>()
))
.append_value("dict string");
struct_builder.append(true);
let array = struct_builder.finish();
Expand All @@ -715,13 +756,15 @@ mod tests {
}

#[test]
#[should_panic(expected = "Data type Dictionary(Int16, Utf8) is not currently supported")]
#[should_panic(
expected = "Data type Dictionary(UInt64, Utf8) with key type UInt64 is not currently supported"
)]
fn test_struct_array_builder_from_schema_unsupported_type() {
let fields = vec![
Field::new("f1", DataType::Int16, false),
Field::new("f1", DataType::UInt64, false),
Field::new(
"f2",
DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)),
DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)),
false,
),
];
Expand All @@ -730,7 +773,7 @@ mod tests {
}

#[test]
#[should_panic(expected = "Unsupported dictionary value type Int32 is not currently supported")]
#[should_panic(expected = "Dictionary value type Int32 is not currently supported")]
fn test_struct_array_builder_from_dict_with_unsupported_value_type() {
let fields = vec![Field::new(
"f1",
Expand Down

0 comments on commit 91f5725

Please sign in to comment.