Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements to UTF-8 statistics truncation #6870

Merged
merged 11 commits into from
Dec 16, 2024
138 changes: 110 additions & 28 deletions parquet/src/column/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1444,15 +1444,31 @@ fn increment(mut data: Vec<u8>) -> Option<Vec<u8>> {
/// Try to increment the string's bytes from right to left, returning when the result
/// is a valid UTF8 string. Returns `None` when it can't increment any byte.
fn increment_utf8(mut data: Vec<u8>) -> Option<Vec<u8>> {
const UTF8_CONTINUATION: u8 = 0x80;
const UTF8_CONTINUATION_MASK: u8 = 0xc0;

let mut len = data.len();
for idx in (0..data.len()).rev() {
let original = data[idx];
let (byte, overflow) = original.overflowing_add(1);
if !overflow {
data[idx] = byte;
if str::from_utf8(&data).is_ok() {
etseidl marked this conversation as resolved.
Show resolved Hide resolved
if len != data.len() {
data.truncate(len);
}
return Some(data);
}
data[idx] = original;
// Incrementing "original" did not yield a valid unicode character, so it overflowed
// its available bits. If it was a continuation byte (b10xxxxxx), then set it to the min
// continuation (b10000000). Otherwise it was the first byte, so reset the first
// byte back to its original value (so data remains a valid string) and reduce "len".
if original & UTF8_CONTINUATION_MASK == UTF8_CONTINUATION {
etseidl marked this conversation as resolved.
Show resolved Hide resolved
etseidl marked this conversation as resolved.
Show resolved Hide resolved
data[idx] = UTF8_CONTINUATION;
} else {
data[idx] = original;
len = idx;
}
}
}

Expand All @@ -1462,6 +1478,7 @@ fn increment_utf8(mut data: Vec<u8>) -> Option<Vec<u8>> {
#[cfg(test)]
mod tests {
use crate::file::properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH;
use core::str;
use rand::distributions::uniform::SampleUniform;
use std::sync::Arc;

Expand Down Expand Up @@ -3140,39 +3157,67 @@ mod tests {

#[test]
fn test_increment_utf8() {
let test_inc = |o: &str, expected: &str| {
if let Ok(v) = String::from_utf8(increment_utf8(o.as_bytes().to_vec()).unwrap()) {
// Got the expected result...
assert_eq!(v, expected);
// and it's greater than the original string
assert!(*v > *o);
// Also show that BinaryArray level comparison works here
let mut greater = ByteArray::new();
greater.set_data(Bytes::from(v));
let mut original = ByteArray::new();
original.set_data(Bytes::from(o.as_bytes().to_vec()));
assert!(greater > original);
etseidl marked this conversation as resolved.
Show resolved Hide resolved
} else {
panic!("Expected incremented UTF8 string to also be valid.");
}
};

// Basic ASCII case
let v = increment_utf8("hello".as_bytes().to_vec()).unwrap();
assert_eq!(&v, "hellp".as_bytes());
test_inc("hello", "hellp");

// 1-byte ending in max 1-byte
test_inc("a\u{7f}", "b");

// Also show that BinaryArray level comparison works here
let mut greater = ByteArray::new();
greater.set_data(Bytes::from(v));
let mut original = ByteArray::new();
original.set_data(Bytes::from("hello".as_bytes().to_vec()));
assert!(greater > original);
// 1-byte max should not truncate as it would need 2-byte code points
assert!(increment_utf8("\u{7f}\u{7f}".as_bytes().to_vec()).is_none());

// UTF8 string
let s = "❤️🧡💛💚💙💜";
let v = increment_utf8(s.as_bytes().to_vec()).unwrap();
test_inc("❤️🧡💛💚💙💜", "❤️🧡💛💚💙💝");

if let Ok(new) = String::from_utf8(v) {
assert_ne!(&new, s);
assert_eq!(new, "❤️🧡💛💚💙💝");
assert!(new.as_bytes().last().unwrap() > s.as_bytes().last().unwrap());
} else {
panic!("Expected incremented UTF8 string to also be valid.")
}
// 2-byte without overflow
test_inc("éééé", "éééê");

// Max UTF8 character - should be a No-Op
let s = char::MAX.to_string();
assert_eq!(s.len(), 4);
let v = increment_utf8(s.as_bytes().to_vec());
assert!(v.is_none());
// 2-byte that overflows lowest byte
test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}");
etseidl marked this conversation as resolved.
Show resolved Hide resolved

// 2-byte ending in max 2-byte
test_inc("a\u{7ff}", "b");

// Max 2-byte should not truncate as it would need 3-byte code points
assert!(increment_utf8("\u{7ff}\u{7ff}".as_bytes().to_vec()).is_none());

// 3-byte without overflow
test_inc("ࠀࠀ", "ࠀࠁ");
alamb marked this conversation as resolved.
Show resolved Hide resolved

// 3-byte ending in max 3-byte
test_inc("a\u{ffff}", "b");

// Max 3-byte should not truncate as it would need 4-byte code points
assert!(increment_utf8("\u{ffff}\u{ffff}".as_bytes().to_vec()).is_none());

// Handle multi-byte UTF8 characters
let s = "a\u{10ffff}";
let v = increment_utf8(s.as_bytes().to_vec());
assert_eq!(&v.unwrap(), "b\u{10ffff}".as_bytes());
// 4-byte without overflow
test_inc("𐀀𐀀", "𐀀𐀁");

// 4-byte ending in max unicode
test_inc("a\u{10ffff}", "b");

// Max 4-byte should not truncate
assert!(increment_utf8("\u{10ffff}\u{10ffff}".as_bytes().to_vec()).is_none());

// Skip over surrogate pair range (0xD800..=0xDFFF)
test_inc("a\u{D7FF}", "a\u{e000}");
}

#[test]
Expand All @@ -3182,7 +3227,6 @@ mod tests {
let r = truncate_utf8(data, data.as_bytes().len()).unwrap();
assert_eq!(r.len(), data.as_bytes().len());
assert_eq!(&r, data.as_bytes());
println!("len is {}", data.len());

// We slice it away from the UTF8 boundary
let r = truncate_utf8(data, 13).unwrap();
Expand All @@ -3192,6 +3236,44 @@ mod tests {
// One multi-byte code point, and a length shorter than it, so we can't slice it
let r = truncate_utf8("\u{0836}", 1);
assert!(r.is_none());

// test truncate and increment for max bounds on utf-8 stats
// 7-bit (i.e. ASCII)
let r = truncate_utf8("yyyyyyyyy", 8)
.and_then(increment_utf8)
.unwrap();
assert_eq!(&r, "yyyyyyyz".as_bytes());

// 2-byte without overflow
let r = truncate_utf8("ééééé", 8).and_then(increment_utf8).unwrap();
assert_eq!(&r, "éééê".as_bytes());

// 2-byte that overflows lowest byte
let r = truncate_utf8("\u{ff}\u{ff}\u{ff}\u{ff}\u{ff}", 8)
.and_then(increment_utf8)
.unwrap();
assert_eq!(&r, "\u{ff}\u{ff}\u{ff}\u{100}".as_bytes());

// max 2-byte should not truncate as it would need 3-byte code points
let r = truncate_utf8("߿߿߿߿߿", 8).and_then(increment_utf8);
assert!(r.is_none());

// 3-byte without overflow
let r = truncate_utf8("ࠀࠀࠀ", 8).and_then(increment_utf8).unwrap();
assert_eq!(&r, "ࠀࠁ".as_bytes());
assert_eq!(r.len(), 6);

// max 3-byte should not truncate as it would need 4-byte code points
let r = truncate_utf8("\u{ffff}\u{ffff}\u{ffff}", 8).and_then(increment_utf8);
assert!(r.is_none());

// 4-byte without overflow
let r = truncate_utf8("𐀀𐀀𐀀", 8).and_then(increment_utf8).unwrap();
assert_eq!(&r, "𐀀𐀁".as_bytes());

// max 4-byte should not truncate
let r = truncate_utf8("\u{10ffff}\u{10ffff}", 8).and_then(increment_utf8);
assert!(r.is_none());
}

#[test]
Expand Down
Loading