From 2e30bc70fec58676e0781a708d3d687425e4f27e Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 12 Dec 2024 18:30:31 +0000 Subject: [PATCH 1/7] add `extend_dictionary` in dictionary builder for improved performance --- .../generic_bytes_dictionary_builder.rs | 67 ++++++++++++++++- .../builder/primitive_dictionary_builder.rs | 71 ++++++++++++++++++- 2 files changed, 134 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index bb0fb5e91be2..6812aa3e329f 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -17,7 +17,7 @@ use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType}; -use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray}; +use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray, TypedDictionaryArray}; use arrow_buffer::ArrowNativeType; use arrow_schema::{ArrowError, DataType}; use hashbrown::HashTable; @@ -305,6 +305,41 @@ where }; } + /// Extends builder with dictionary + /// + /// This is the same as `extends` but avoid lookup for each item in the iterator + /// + pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray>) -> Result<(), ArrowError> { + let values = dictionary.values(); + + let v_len = values.len(); + if v_len == 0 { + return Ok(()); + } + + // Orphan values will be carried over to the new dictionary + let mapped_values = values.iter() + // Dictionary values should not be null as the keys are null if the value is null + .map(|dict_value| self.get_or_insert_key(dict_value).expect("Dictionary value should not be null")) + .collect::, _>>()?; + + // Just insert the keys without additional lookups + dictionary + .keys() + .iter() + .for_each(|key| { + match key { + None => self.append_null(), + Some(original_dict_index) => { + let index = original_dict_index.as_usize().min(v_len - 1); + self.keys_builder.append_value(mapped_values[index]); + } + } + }); + + Ok(()) + } + /// Builds the `DictionaryArray` and reset this builder. pub fn finish(&mut self) -> DictionaryArray { self.dedup.clear(); @@ -664,4 +699,34 @@ mod tests { assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]); assert_eq!(dict.values().len(), 4); } + + #[test] + fn test_extend_dictionary() { + let some_dict = { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some)); + builder.extend([None::<&str>].into_iter()); + builder.extend(["c", "d", "a"].into_iter().map(Some)); + builder.append_null(); + builder.finish() + }; + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some)); + builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 6); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [Some("e"), Some("e"), Some("f"), Some("e"), Some("d"), Some("a"), Some("b"), Some("c"), Some("a"), Some("b"), Some("c"), None, Some("c"), Some("d"), Some("a"), None] + ); + } } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index ac40f8a469d3..0eb6c8205b5f 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -17,7 +17,7 @@ use crate::builder::{ArrayBuilder, PrimitiveBuilder}; use crate::types::ArrowDictionaryKeyType; -use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray}; +use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, PrimitiveArray, TypedDictionaryArray}; use arrow_buffer::{ArrowNativeType, ToByteSlice}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -303,6 +303,41 @@ where }; } + /// Extends builder with dictionary + /// + /// This is the same as `extends` but avoid lookup for each item in the iterator + /// + pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray>) -> Result<(), ArrowError> { + let values = dictionary.values(); + + let v_len = values.len(); + if v_len == 0 { + return Ok(()); + } + + // Orphan values will be carried over to the new dictionary + let mapped_values = values.iter() + // Dictionary values should not be null as the keys are null if the value is null + .map(|dict_value| self.get_or_insert_key(dict_value.expect("Dictionary value should not be null"))) + .collect::, _>>()?; + + // Just insert the keys without additional lookups + dictionary + .keys() + .iter() + .for_each(|key| { + match key { + None => self.append_null(), + Some(original_dict_index) => { + let index = original_dict_index.as_usize().min(v_len - 1); + self.keys_builder.append_value(mapped_values[index]); + } + } + }); + + Ok(()) + } + /// Builds the `DictionaryArray` and reset this builder. pub fn finish(&mut self) -> DictionaryArray { self.map.clear(); @@ -368,8 +403,7 @@ impl Extend> mod tests { use super::*; - use crate::array::UInt32Array; - use crate::array::UInt8Array; + use crate::array::{UInt32Array, UInt8Array, Int32Array}; use crate::builder::Decimal128Builder; use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type}; @@ -443,4 +477,35 @@ mod tests { ) ); } + + + #[test] + fn test_extend_dictionary() { + let some_dict = { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some)); + builder.extend([None::].into_iter()); + builder.extend([4, 5, 1, 3, 1].into_iter().map(Some)); + builder.append_null(); + builder.finish() + }; + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend([6, 6, 7, 6, 5].into_iter().map(Some)); + builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 7); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [Some(6), Some(6), Some(7), Some(6), Some(5), Some(1), Some(2), Some(3), Some(1), Some(2), Some(3), Some(1), Some(2), Some(3), None, Some(4), Some(5), Some(1), Some(3), Some(1), None] + ); + } } From 5512218e572d7830ff02304b717dddce8d7e924b Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 12 Dec 2024 19:22:20 +0000 Subject: [PATCH 2/7] fix extends all nulls --- .../generic_bytes_dictionary_builder.rs | 41 ++++++++++++++++++- .../builder/primitive_dictionary_builder.rs | 37 +++++++++++++++++ 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 6812aa3e329f..1a86e0405231 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -311,16 +311,27 @@ where /// pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray>) -> Result<(), ArrowError> { let values = dictionary.values(); - + let v_len = values.len(); + let k_len = dictionary.keys().len(); + if v_len == 0 && k_len == 0 { + return Ok(()); + } + + // All nulls if v_len == 0 { + self.append_nulls(k_len); return Ok(()); } + if k_len == 0 { + return Err(ArrowError::InvalidArgumentError("Dictionary keys should not be empty when values are not empty".to_string())); + } + // Orphan values will be carried over to the new dictionary let mapped_values = values.iter() // Dictionary values should not be null as the keys are null if the value is null - .map(|dict_value| self.get_or_insert_key(dict_value).expect("Dictionary value should not be null")) + .map(|dict_value| self.get_or_insert_key(dict_value.expect("Dictionary value should not be null"))) .collect::, _>>()?; // Just insert the keys without additional lookups @@ -729,4 +740,30 @@ mod tests { [Some("e"), Some("e"), Some("f"), Some("e"), Some("d"), Some("a"), Some("b"), Some("c"), Some("a"), Some("b"), Some("c"), None, Some("c"), Some("d"), Some("a"), None] ); } + + #[test] + fn test_extend_all_null_dictionary() { + let some_dict = { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.append_nulls(2); + builder.finish() + }; + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 0); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [None, None] + ); + } } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 0eb6c8205b5f..1f1701aa8cd6 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -311,10 +311,21 @@ where let values = dictionary.values(); let v_len = values.len(); + let k_len = dictionary.keys().len(); + if v_len == 0 && k_len == 0 { + return Ok(()); + } + + // All nulls if v_len == 0 { + self.append_nulls(k_len); return Ok(()); } + if k_len == 0 { + return Err(ArrowError::InvalidArgumentError("Dictionary keys should not be empty when values are not empty".to_string())); + } + // Orphan values will be carried over to the new dictionary let mapped_values = values.iter() // Dictionary values should not be null as the keys are null if the value is null @@ -508,4 +519,30 @@ mod tests { [Some(6), Some(6), Some(7), Some(6), Some(5), Some(1), Some(2), Some(3), Some(1), Some(2), Some(3), Some(1), Some(2), Some(3), None, Some(4), Some(5), Some(1), Some(3), Some(1), None] ); } + + #[test] + fn test_extend_all_null_dictionary() { + let some_dict = { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.append_nulls(2); + builder.finish() + }; + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 0); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [None, None] + ); + } } From e1b8dc954c95189be1c95e2f695d184ef1a51a0d Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 12 Dec 2024 23:58:46 +0200 Subject: [PATCH 3/7] support null in mapped value --- .../generic_bytes_dictionary_builder.rs | 60 +++++++++++++++-- .../builder/primitive_dictionary_builder.rs | 65 +++++++++++++++++-- 2 files changed, 116 insertions(+), 9 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 1a86e0405231..8d90339642ad 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -330,8 +330,10 @@ where // Orphan values will be carried over to the new dictionary let mapped_values = values.iter() - // Dictionary values should not be null as the keys are null if the value is null - .map(|dict_value| self.get_or_insert_key(dict_value.expect("Dictionary value should not be null"))) + // Dictionary values can technically be null, so we need to handle that + .map(|dict_value| dict_value + .map(|dict_value| self.get_or_insert_key(dict_value)) + .transpose()) .collect::, _>>()?; // Just insert the keys without additional lookups @@ -343,7 +345,10 @@ where None => self.append_null(), Some(original_dict_index) => { let index = original_dict_index.as_usize().min(v_len - 1); - self.keys_builder.append_value(mapped_values[index]); + match mapped_values[index] { + None => self.append_null(), + Some(mapped_value) => self.keys_builder.append_value(mapped_value), + } } } }); @@ -491,8 +496,9 @@ mod tests { use super::*; use crate::array::Int8Array; + use crate::cast::AsArray; use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type}; - use crate::{BinaryArray, StringArray}; + use crate::{ArrowPrimitiveType, BinaryArray, StringArray}; fn test_bytes_dictionary_builder(values: Vec<&T::Native>) where @@ -740,6 +746,52 @@ mod tests { [Some("e"), Some("e"), Some("f"), Some("e"), Some("d"), Some("a"), Some("b"), Some("c"), Some("a"), Some("b"), Some("c"), None, Some("c"), Some("d"), Some("a"), None] ); } + #[test] + fn test_extend_dictionary_with_null_in_mapped_value() { + let some_dict = { + let mut values_builder = GenericByteBuilder::::new(); + let mut keys_builder = PrimitiveBuilder::::new(); + + // Manually build a dictionary values that the mapped values have null + values_builder.append_null(); + keys_builder.append_value(0); + values_builder.append_value("I like worm hugs"); + keys_builder.append_value(1); + + let values = values_builder.finish(); + let keys = keys_builder.finish(); + + let data_type = DataType::Dictionary(Box::new(Int32Type::DATA_TYPE), Box::new(Utf8Type::DATA_TYPE)); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + }; + + let some_dict_values = some_dict.values().as_string::(); + assert_eq!(some_dict_values.into_iter().collect::>(), &[None, Some("I like worm hugs")]); + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [None, Some("I like worm hugs")] + ); + } #[test] fn test_extend_all_null_dictionary() { diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 1f1701aa8cd6..ac9fa478bfe7 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -305,7 +305,9 @@ where /// Extends builder with dictionary /// - /// This is the same as `extends` but avoid lookup for each item in the iterator + /// This is the same as `extends` but avoid lookup for each item in the iterator + /// + /// when dictionary values are null (the actual mapped values) the keys are null /// pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray>) -> Result<(), ArrowError> { let values = dictionary.values(); @@ -328,8 +330,10 @@ where // Orphan values will be carried over to the new dictionary let mapped_values = values.iter() - // Dictionary values should not be null as the keys are null if the value is null - .map(|dict_value| self.get_or_insert_key(dict_value.expect("Dictionary value should not be null"))) + // Dictionary values can technically be null, so we need to handle that + .map(|dict_value| dict_value + .map(|dict_value| self.get_or_insert_key(dict_value)) + .transpose()) .collect::, _>>()?; // Just insert the keys without additional lookups @@ -341,7 +345,10 @@ where None => self.append_null(), Some(original_dict_index) => { let index = original_dict_index.as_usize().min(v_len - 1); - self.keys_builder.append_value(mapped_values[index]); + match mapped_values[index] { + None => self.append_null(), + Some(mapped_value) => self.keys_builder.append_value(mapped_value), + } } } }); @@ -416,6 +423,7 @@ mod tests { use crate::array::{UInt32Array, UInt8Array, Int32Array}; use crate::builder::Decimal128Builder; + use crate::cast::AsArray; use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type}; #[test] @@ -489,7 +497,6 @@ mod tests { ); } - #[test] fn test_extend_dictionary() { let some_dict = { @@ -520,6 +527,54 @@ mod tests { ); } + #[test] + fn test_extend_dictionary_with_null_in_mapped_value() { + let some_dict = { + let mut values_builder = PrimitiveBuilder::::new(); + let mut keys_builder = PrimitiveBuilder::::new(); + + // Manually build a dictionary values that the mapped values have null + values_builder.append_null(); + keys_builder.append_value(0); + values_builder.append_value(42); + keys_builder.append_value(1); + + let values = values_builder.finish(); + let keys = keys_builder.finish(); + + let data_type = + DataType::Dictionary(Box::new(Int32Type::DATA_TYPE), Box::new(values.data_type().clone())); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + }; + + let some_dict_values = some_dict.values().as_primitive::(); + assert_eq!(some_dict_values.into_iter().collect::>(), &[None, Some(42)]); + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [None, Some(42)] + ); + } + #[test] fn test_extend_all_null_dictionary() { let some_dict = { From dbd1fdd96290e5aa6453b558ba73d818f159523f Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Fri, 13 Dec 2024 00:00:06 +0200 Subject: [PATCH 4/7] adding comment --- arrow-array/src/builder/generic_bytes_dictionary_builder.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 8d90339642ad..6bc4c0a48d35 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -307,7 +307,9 @@ where /// Extends builder with dictionary /// - /// This is the same as `extends` but avoid lookup for each item in the iterator + /// This is the same as `extends` but avoid lookup for each item in the iterator + /// + /// when dictionary values are null (the actual mapped values) the keys are null /// pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray>) -> Result<(), ArrowError> { let values = dictionary.values(); From a09615af53bfd8d0d78c73a0b45b2b12708ed7d2 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Fri, 13 Dec 2024 00:01:04 +0200 Subject: [PATCH 5/7] run `clippy` and `fmt` --- .../generic_bytes_dictionary_builder.rs | 106 +++++++++++------- .../builder/primitive_dictionary_builder.rs | 104 +++++++++++------ 2 files changed, 134 insertions(+), 76 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 6bc4c0a48d35..54034c847c7c 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -311,9 +311,12 @@ where /// /// when dictionary values are null (the actual mapped values) the keys are null /// - pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray>) -> Result<(), ArrowError> { + pub fn extend_dictionary( + &mut self, + dictionary: &TypedDictionaryArray>, + ) -> Result<(), ArrowError> { let values = dictionary.values(); - + let v_len = values.len(); let k_len = dictionary.keys().len(); if v_len == 0 && k_len == 0 { @@ -327,33 +330,33 @@ where } if k_len == 0 { - return Err(ArrowError::InvalidArgumentError("Dictionary keys should not be empty when values are not empty".to_string())); + return Err(ArrowError::InvalidArgumentError( + "Dictionary keys should not be empty when values are not empty".to_string(), + )); } // Orphan values will be carried over to the new dictionary - let mapped_values = values.iter() + let mapped_values = values + .iter() // Dictionary values can technically be null, so we need to handle that - .map(|dict_value| dict_value - .map(|dict_value| self.get_or_insert_key(dict_value)) - .transpose()) + .map(|dict_value| { + dict_value + .map(|dict_value| self.get_or_insert_key(dict_value)) + .transpose() + }) .collect::, _>>()?; // Just insert the keys without additional lookups - dictionary - .keys() - .iter() - .for_each(|key| { - match key { + dictionary.keys().iter().for_each(|key| match key { + None => self.append_null(), + Some(original_dict_index) => { + let index = original_dict_index.as_usize().min(v_len - 1); + match mapped_values[index] { None => self.append_null(), - Some(original_dict_index) => { - let index = original_dict_index.as_usize().min(v_len - 1); - match mapped_values[index] { - None => self.append_null(), - Some(mapped_value) => self.keys_builder.append_value(mapped_value), - } - } + Some(mapped_value) => self.keys_builder.append_value(mapped_value), } - }); + } + }); Ok(()) } @@ -732,20 +735,39 @@ mod tests { let mut builder = GenericByteDictionaryBuilder::::new(); builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some)); - builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); let dict = builder.finish(); - + assert_eq!(dict.values().len(), 6); - + let values = dict .downcast_dict::>() .unwrap() .into_iter() .collect::>(); - + assert_eq!( values, - [Some("e"), Some("e"), Some("f"), Some("e"), Some("d"), Some("a"), Some("b"), Some("c"), Some("a"), Some("b"), Some("c"), None, Some("c"), Some("d"), Some("a"), None] + [ + Some("e"), + Some("e"), + Some("f"), + Some("e"), + Some("d"), + Some("a"), + Some("b"), + Some("c"), + Some("a"), + Some("b"), + Some("c"), + None, + Some("c"), + Some("d"), + Some("a"), + None + ] ); } #[test] @@ -763,7 +785,10 @@ mod tests { let values = values_builder.finish(); let keys = keys_builder.finish(); - let data_type = DataType::Dictionary(Box::new(Int32Type::DATA_TYPE), Box::new(Utf8Type::DATA_TYPE)); + let data_type = DataType::Dictionary( + Box::new(Int32Type::DATA_TYPE), + Box::new(Utf8Type::DATA_TYPE), + ); let builder = keys .into_data() @@ -775,10 +800,15 @@ mod tests { }; let some_dict_values = some_dict.values().as_string::(); - assert_eq!(some_dict_values.into_iter().collect::>(), &[None, Some("I like worm hugs")]); + assert_eq!( + some_dict_values.into_iter().collect::>(), + &[None, Some("I like worm hugs")] + ); let mut builder = GenericByteDictionaryBuilder::::new(); - builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); let dict = builder.finish(); assert_eq!(dict.values().len(), 1); @@ -789,10 +819,7 @@ mod tests { .into_iter() .collect::>(); - assert_eq!( - values, - [None, Some("I like worm hugs")] - ); + assert_eq!(values, [None, Some("I like worm hugs")]); } #[test] @@ -804,20 +831,19 @@ mod tests { }; let mut builder = GenericByteDictionaryBuilder::::new(); - builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); let dict = builder.finish(); - + assert_eq!(dict.values().len(), 0); - + let values = dict .downcast_dict::>() .unwrap() .into_iter() .collect::>(); - - assert_eq!( - values, - [None, None] - ); + + assert_eq!(values, [None, None]); } } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index ac9fa478bfe7..90f6fcfdc866 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -17,7 +17,9 @@ use crate::builder::{ArrayBuilder, PrimitiveBuilder}; use crate::types::ArrowDictionaryKeyType; -use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, PrimitiveArray, TypedDictionaryArray}; +use crate::{ + Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, PrimitiveArray, TypedDictionaryArray, +}; use arrow_buffer::{ArrowNativeType, ToByteSlice}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -309,7 +311,10 @@ where /// /// when dictionary values are null (the actual mapped values) the keys are null /// - pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray>) -> Result<(), ArrowError> { + pub fn extend_dictionary( + &mut self, + dictionary: &TypedDictionaryArray>, + ) -> Result<(), ArrowError> { let values = dictionary.values(); let v_len = values.len(); @@ -325,33 +330,33 @@ where } if k_len == 0 { - return Err(ArrowError::InvalidArgumentError("Dictionary keys should not be empty when values are not empty".to_string())); + return Err(ArrowError::InvalidArgumentError( + "Dictionary keys should not be empty when values are not empty".to_string(), + )); } // Orphan values will be carried over to the new dictionary - let mapped_values = values.iter() + let mapped_values = values + .iter() // Dictionary values can technically be null, so we need to handle that - .map(|dict_value| dict_value - .map(|dict_value| self.get_or_insert_key(dict_value)) - .transpose()) + .map(|dict_value| { + dict_value + .map(|dict_value| self.get_or_insert_key(dict_value)) + .transpose() + }) .collect::, _>>()?; // Just insert the keys without additional lookups - dictionary - .keys() - .iter() - .for_each(|key| { - match key { + dictionary.keys().iter().for_each(|key| match key { + None => self.append_null(), + Some(original_dict_index) => { + let index = original_dict_index.as_usize().min(v_len - 1); + match mapped_values[index] { None => self.append_null(), - Some(original_dict_index) => { - let index = original_dict_index.as_usize().min(v_len - 1); - match mapped_values[index] { - None => self.append_null(), - Some(mapped_value) => self.keys_builder.append_value(mapped_value), - } - } + Some(mapped_value) => self.keys_builder.append_value(mapped_value), } - }); + } + }); Ok(()) } @@ -421,7 +426,7 @@ impl Extend> mod tests { use super::*; - use crate::array::{UInt32Array, UInt8Array, Int32Array}; + use crate::array::{Int32Array, UInt32Array, UInt8Array}; use crate::builder::Decimal128Builder; use crate::cast::AsArray; use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type}; @@ -510,7 +515,9 @@ mod tests { let mut builder = PrimitiveDictionaryBuilder::::new(); builder.extend([6, 6, 7, 6, 5].into_iter().map(Some)); - builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); let dict = builder.finish(); assert_eq!(dict.values().len(), 7); @@ -523,7 +530,29 @@ mod tests { assert_eq!( values, - [Some(6), Some(6), Some(7), Some(6), Some(5), Some(1), Some(2), Some(3), Some(1), Some(2), Some(3), Some(1), Some(2), Some(3), None, Some(4), Some(5), Some(1), Some(3), Some(1), None] + [ + Some(6), + Some(6), + Some(7), + Some(6), + Some(5), + Some(1), + Some(2), + Some(3), + Some(1), + Some(2), + Some(3), + Some(1), + Some(2), + Some(3), + None, + Some(4), + Some(5), + Some(1), + Some(3), + Some(1), + None + ] ); } @@ -542,8 +571,10 @@ mod tests { let values = values_builder.finish(); let keys = keys_builder.finish(); - let data_type = - DataType::Dictionary(Box::new(Int32Type::DATA_TYPE), Box::new(values.data_type().clone())); + let data_type = DataType::Dictionary( + Box::new(Int32Type::DATA_TYPE), + Box::new(values.data_type().clone()), + ); let builder = keys .into_data() @@ -555,10 +586,15 @@ mod tests { }; let some_dict_values = some_dict.values().as_primitive::(); - assert_eq!(some_dict_values.into_iter().collect::>(), &[None, Some(42)]); + assert_eq!( + some_dict_values.into_iter().collect::>(), + &[None, Some(42)] + ); let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); let dict = builder.finish(); assert_eq!(dict.values().len(), 1); @@ -569,10 +605,7 @@ mod tests { .into_iter() .collect::>(); - assert_eq!( - values, - [None, Some(42)] - ); + assert_eq!(values, [None, Some(42)]); } #[test] @@ -584,7 +617,9 @@ mod tests { }; let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); let dict = builder.finish(); assert_eq!(dict.values().len(), 0); @@ -595,9 +630,6 @@ mod tests { .into_iter() .collect::>(); - assert_eq!( - values, - [None, None] - ); + assert_eq!(values, [None, None]); } } From d86dcc9060b12bc03f527801767ed809dc225fe8 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 17 Dec 2024 19:53:00 +0000 Subject: [PATCH 6/7] fix ci --- arrow-array/src/builder/generic_bytes_dictionary_builder.rs | 2 +- arrow-array/src/builder/primitive_dictionary_builder.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 54034c847c7c..4a661063ef94 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -727,7 +727,7 @@ mod tests { let some_dict = { let mut builder = GenericByteDictionaryBuilder::::new(); builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some)); - builder.extend([None::<&str>].into_iter()); + builder.extend([None::<&str>]); builder.extend(["c", "d", "a"].into_iter().map(Some)); builder.append_null(); builder.finish() diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 90f6fcfdc866..00498e46a1b1 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -46,7 +46,7 @@ impl PartialEq for Value { impl Eq for Value {} -/// Builder for [`DictionaryArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray) +/// Builder for [`DictionaryArray`] of [`PrimitiveArray`] /// /// # Example: /// @@ -507,7 +507,7 @@ mod tests { let some_dict = { let mut builder = PrimitiveDictionaryBuilder::::new(); builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some)); - builder.extend([None::].into_iter()); + builder.extend([None::]); builder.extend([4, 5, 1, 3, 1].into_iter().map(Some)); builder.append_null(); builder.finish() From 5d48eda90c68ba89fb8e74fbee88ed9de63e3b22 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Wed, 18 Dec 2024 18:17:06 +0200 Subject: [PATCH 7/7] Apply suggestions from code review Co-authored-by: Andrew Lamb --- arrow-array/src/builder/generic_bytes_dictionary_builder.rs | 5 +++-- arrow-array/src/builder/primitive_dictionary_builder.rs | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 4a661063ef94..ead151d5ceea 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -305,9 +305,10 @@ where }; } - /// Extends builder with dictionary + /// Extends builder with an existing dictionary array. /// - /// This is the same as `extends` but avoid lookup for each item in the iterator + /// This is the same as [`Self::extend`] but is faster as it translates + /// the dictionary values once rather than doing a lookup for each item in the iterator /// /// when dictionary values are null (the actual mapped values) the keys are null /// diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 00498e46a1b1..282f0ae9d5b1 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -307,7 +307,8 @@ where /// Extends builder with dictionary /// - /// This is the same as `extends` but avoid lookup for each item in the iterator + /// This is the same as [`Self::extend`] but is faster as it translates + /// the dictionary values once rather than doing a lookup for each item in the iterator /// /// when dictionary values are null (the actual mapped values) the keys are null ///