-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Allow decoding of non-Polars arrow dictionaries in Arrow and Pa…
…rquet (#20248)
- Loading branch information
1 parent
028297a
commit 6d0f5df
Showing
52 changed files
with
1,104 additions
and
1,150 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
use arrow::array::{Array, BinaryViewArray, PrimitiveArray, Utf8ViewArray}; | ||
use arrow::bitmap::Bitmap; | ||
use arrow::datatypes::ArrowDataType::UInt32; | ||
|
||
/// Propagate the nulls from the dictionary values into the keys and remove those nulls from the | ||
/// values. | ||
pub fn propagate_dictionary_value_nulls( | ||
keys: &PrimitiveArray<u32>, | ||
values: &Utf8ViewArray, | ||
) -> (PrimitiveArray<u32>, Utf8ViewArray) { | ||
let Some(values_validity) = values.validity() else { | ||
return (keys.clone(), values.clone().with_validity(None)); | ||
}; | ||
if values_validity.unset_bits() == 0 { | ||
return (keys.clone(), values.clone().with_validity(None)); | ||
} | ||
|
||
let num_values = values.len(); | ||
|
||
// Create a map from the old indices to indices with nulls filtered out | ||
let mut offset = 0; | ||
let new_idx_map: Vec<u32> = (0..num_values) | ||
.map(|i| { | ||
let is_valid = unsafe { values_validity.get_bit_unchecked(i) }; | ||
offset += usize::from(!is_valid); | ||
if is_valid { | ||
(i - offset) as u32 | ||
} else { | ||
0 | ||
} | ||
}) | ||
.collect(); | ||
|
||
let keys = match keys.validity() { | ||
None => { | ||
let values = keys | ||
.values() | ||
.iter() | ||
.map(|&k| unsafe { | ||
// SAFETY: Arrow invariant that all keys are in range of values | ||
*new_idx_map.get_unchecked(k as usize) | ||
}) | ||
.collect(); | ||
let validity = Bitmap::from_iter(keys.values().iter().map(|&k| unsafe { | ||
// SAFETY: Arrow invariant that all keys are in range of values | ||
values_validity.get_bit_unchecked(k as usize) | ||
})); | ||
|
||
PrimitiveArray::new(UInt32, values, Some(validity)) | ||
}, | ||
Some(keys_validity) => { | ||
let values = keys | ||
.values() | ||
.iter() | ||
.map(|&k| { | ||
// deal with nulls in keys | ||
let idx = (k as usize).min(num_values); | ||
// SAFETY: Arrow invariant that all keys are in range of values | ||
*unsafe { new_idx_map.get_unchecked(idx) } | ||
}) | ||
.collect(); | ||
let propagated_validity = Bitmap::from_iter(keys.values().iter().map(|&k| { | ||
// deal with nulls in keys | ||
let idx = (k as usize).min(num_values); | ||
// SAFETY: Arrow invariant that all keys are in range of values | ||
unsafe { values_validity.get_bit_unchecked(idx) } | ||
})); | ||
|
||
let validity = &propagated_validity & keys_validity; | ||
PrimitiveArray::new(UInt32, values, Some(validity)) | ||
}, | ||
}; | ||
|
||
// Filter only handles binary | ||
let values = values.to_binview(); | ||
|
||
// Filter out the null values | ||
let values = crate::filter::filter_with_bitmap(&values, values_validity); | ||
let values = values.as_any().downcast_ref::<BinaryViewArray>().unwrap(); | ||
let values = unsafe { values.to_utf8view_unchecked() }.clone(); | ||
|
||
// Explicitly set the values validity to none. | ||
assert_eq!(values.null_count(), 0); | ||
let values = values.with_validity(None); | ||
|
||
(keys, values) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.