diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 8138549b1a0e..ce593cd2b04d 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1199,8 +1199,20 @@ impl TryFrom<(i128, u8)> for Variant<'_, '_> { #[cfg(test)] mod tests { + use super::*; + #[test] + fn test_empty_variant_will_fail() { + let metadata = VariantMetadata::try_new(&[1, 0, 0]).unwrap(); + + let err = Variant::try_new_with_metadata(metadata, &[]).unwrap_err(); + + assert!(matches!( + err, + ArrowError::InvalidArgumentError(ref msg) if msg == "Received empty bytes")); + } + #[test] fn test_construct_short_string() { let short_string = ShortString::try_new("norm").expect("should fit in short string"); diff --git a/parquet-variant/src/variant/list.rs b/parquet-variant/src/variant/list.rs index 17f87a2e0d7a..6de6ed830720 100644 --- a/parquet-variant/src/variant/list.rs +++ b/parquet-variant/src/variant/list.rs @@ -216,28 +216,19 @@ impl<'m, 'v> VariantList<'m, 'v> { self.header.first_offset_byte() as _..self.first_value_byte as _, )?; - let offsets = - map_bytes_to_offsets(offset_buffer, self.header.offset_size).collect::>(); - - // Validate offsets are in-bounds and monotonically increasing. - // Since shallow verification checks whether the first and last offsets are in-bounds, - // we can also verify all offsets are in-bounds by checking if offsets are monotonically increasing. - let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b); - if !are_offsets_monotonic { - return Err(ArrowError::InvalidArgumentError( - "offsets are not monotonically increasing".to_string(), - )); - } - let value_buffer = slice_from_slice(self.value, self.first_value_byte as _..)?; // Validate whether values are valid variant objects - for i in 1..offsets.len() { - let start_offset = offsets[i - 1]; - let end_offset = offsets[i]; - - let value_bytes = slice_from_slice(value_buffer, start_offset..end_offset)?; + // + // Since we use offsets to slice into the value buffer, this also verifies all offsets are in-bounds + // and monotonically increasing + let mut offset_iter = map_bytes_to_offsets(offset_buffer, self.header.offset_size); + let mut current_offset = offset_iter.next().unwrap_or(0); + + for next_offset in offset_iter { + let value_bytes = slice_from_slice(value_buffer, current_offset..next_offset)?; Variant::try_new_with_metadata(self.metadata.clone(), value_bytes)?; + current_offset = next_offset; } self.validated = true; diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index 007122af7599..9653473b10e4 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -237,22 +237,15 @@ impl<'m> VariantMetadata<'m> { let offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size).collect::>(); - // Validate offsets are in-bounds and monotonically increasing. - // Since shallow validation ensures the first and last offsets are in bounds, we can also verify all offsets - // are in-bounds by checking if offsets are monotonically increasing. - let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b); - if !are_offsets_monotonic { - return Err(ArrowError::InvalidArgumentError( - "offsets not monotonically increasing".to_string(), - )); - } - // Verify the string values in the dictionary are UTF-8 encoded strings. let value_buffer = string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?; if self.header.is_sorted { // Validate the dictionary values are unique and lexicographically sorted + // + // Since we use the offsets to access dictionary values, this also validates + // offsets are in-bounds and monotonically increasing let are_dictionary_values_unique_and_sorted = (1..offsets.len()) .map(|i| { let field_range = offsets[i - 1]..offsets[i]; @@ -268,6 +261,18 @@ impl<'m> VariantMetadata<'m> { "dictionary values are not unique and ordered".to_string(), )); } + } else { + // Validate offsets are in-bounds and monotonically increasing + // + // Since shallow validation ensures the first and last offsets are in bounds, + // we can also verify all offsets are in-bounds by checking if + // offsets are monotonically increasing + let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b); + if !are_offsets_monotonic { + return Err(ArrowError::InvalidArgumentError( + "offsets not monotonically increasing".to_string(), + )); + } } self.validated = true; diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index dd6da08fbe64..e2c6cb7b79ed 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -242,6 +242,8 @@ impl<'m, 'v> VariantObject<'m, 'v> { } else { // The metadata dictionary can't guarantee uniqueness or sortedness, so we have to parse out the corresponding field names // to check lexicographical order + // + // Since we are probing the metadata dictionary by field id, this also verifies field ids are in-bounds let are_field_names_sorted = field_ids .iter() .map(|&i| self.metadata.get(i)) @@ -253,19 +255,6 @@ impl<'m, 'v> VariantObject<'m, 'v> { "field names not sorted".to_string(), )); } - - // Since field ids are not guaranteed to be sorted, scan over all field ids - // and check that field ids are less than dictionary size - - let are_field_ids_in_bounds = field_ids - .iter() - .all(|&id| id < self.metadata.dictionary_size()); - - if !are_field_ids_in_bounds { - return Err(ArrowError::InvalidArgumentError( - "field id is not valid".to_string(), - )); - } } // Validate whether values are valid variant objects