From 5595019bad5f7fa8c75106d31bf86c8c99d2822c Mon Sep 17 00:00:00 2001 From: ByteBaker <42913098+ByteBaker@users.noreply.github.com> Date: Wed, 2 Oct 2024 15:39:51 +0530 Subject: [PATCH] chore: add docs, part of #37 (#6496) - add pragma `#![warn(missing_docs)]` to `parquet` This is the final component in the effort to make Arrow fully-documented. The entire project now generates warnings for missing docs, if any. - `arrow-flight`: replace `tonic`'s deprecated `compile_with_config` with suggested method - new deprecation: The following types were not used anywhere and were possibly strays. They've been marked as deprecated and will be removed in future versions. - `parquet::data_type::SliceAsBytesDataType` - `parquet::column::writer::Level` --- arrow-flight/gen/src/main.rs | 6 +-- parquet/src/arrow/async_reader/metadata.rs | 1 + parquet/src/basic.rs | 47 ++++++++++++++++- parquet/src/column/page.rs | 28 ++++++++++ parquet/src/column/reader.rs | 8 +++ parquet/src/column/writer/mod.rs | 15 ++++++ parquet/src/data_type.rs | 22 ++++++++ parquet/src/errors.rs | 2 + parquet/src/file/footer.rs | 2 + parquet/src/file/metadata/mod.rs | 8 +++ parquet/src/file/metadata/writer.rs | 1 + parquet/src/file/page_index/index.rs | 28 ++++++++++ parquet/src/file/page_index/offset_index.rs | 3 ++ parquet/src/file/properties.rs | 2 + parquet/src/file/reader.rs | 2 +- parquet/src/file/statistics.rs | 14 ++++- parquet/src/file/writer.rs | 1 + parquet/src/lib.rs | 3 +- parquet/src/record/api.rs | 57 +++++++++++++++++++-- parquet/src/record/reader.rs | 16 +++--- parquet/src/record/record_reader.rs | 3 +- parquet/src/schema/types.rs | 11 ++++ parquet/src/schema/visitor.rs | 2 + parquet/src/thrift.rs | 2 + 24 files changed, 262 insertions(+), 22 deletions(-) diff --git a/arrow-flight/gen/src/main.rs b/arrow-flight/gen/src/main.rs index c4cb9dfec5ad..a69134e7acbe 100644 --- a/arrow-flight/gen/src/main.rs +++ b/arrow-flight/gen/src/main.rs @@ -26,7 +26,7 @@ fn main() -> Result<(), Box> { let 
proto_path = Path::new("../format/Flight.proto"); tonic_build::configure() - // protoc in unbuntu builder needs this option + // protoc in Ubuntu builder needs this option .protoc_arg("--experimental_allow_proto3_optional") .out_dir("src") .compile_protos_with_config(prost_config(), &[proto_path], &[proto_dir])?; @@ -37,7 +37,7 @@ fn main() -> Result<(), Box> { .open("src/arrow.flight.protocol.rs")?; let mut buffer = String::new(); file.read_to_string(&mut buffer)?; - // append warning that file was auto-generate + // append warning that file was auto-generated let mut file = OpenOptions::new() .write(true) .truncate(true) @@ -49,7 +49,7 @@ fn main() -> Result<(), Box> { let proto_path = Path::new("../format/FlightSql.proto"); tonic_build::configure() - // protoc in ubuntu builder needs this option + // protoc in Ubuntu builder needs this option .protoc_arg("--experimental_allow_proto3_optional") .out_dir("src/sql") .compile_protos_with_config(prost_config(), &[proto_path], &[proto_dir])?; diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index b7fac6fe7c05..b2c6159bee3a 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -29,6 +29,7 @@ use std::ops::Range; /// A data source that can be used with [`MetadataLoader`] to load [`ParquetMetaData`] pub trait MetadataFetch { + /// Fetches a range of bytes asynchronously fn fetch(&mut self, range: Range) -> BoxFuture<'_, Result>; } diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 8fde542f59c8..1926b87623bf 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -47,13 +47,21 @@ pub use crate::format::{ #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[allow(non_camel_case_types)] pub enum Type { + /// A boolean value. BOOLEAN, + /// 32-bit signed integer. INT32, + /// 64-bit signed integer. INT64, + /// 96-bit signed integer for timestamps. 
INT96, + /// IEEE 754 single-precision floating point value. FLOAT, + /// IEEE 754 double-precision floating point value. DOUBLE, + /// Arbitrary length byte array. BYTE_ARRAY, + /// Fixed length byte array. FIXED_LEN_BYTE_ARRAY, } @@ -70,6 +78,7 @@ pub enum Type { #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[allow(non_camel_case_types)] pub enum ConvertedType { + /// No type conversion. NONE, /// A BYTE_ARRAY actually contains UTF8 encoded chars. UTF8, @@ -171,31 +180,53 @@ pub enum ConvertedType { /// [`ConvertedType`]. Please see the README.md for more details. #[derive(Debug, Clone, PartialEq, Eq)] pub enum LogicalType { + /// A UTF8 encoded string. String, + /// A map of key-value pairs. Map, + /// A list of elements. List, + /// A set of predefined values. Enum, + /// A decimal value with a specified scale and precision. Decimal { + /// The number of digits to the right of the decimal point. scale: i32, + /// The total number of digits in the number. precision: i32, }, + /// A date stored as days since Unix epoch. Date, + /// A time stored as [`TimeUnit`] since midnight. Time { + /// Whether the time is adjusted to UTC. is_adjusted_to_u_t_c: bool, + /// The unit of time. unit: TimeUnit, }, + /// A timestamp stored as [`TimeUnit`] since Unix epoch. Timestamp { + /// Whether the timestamp is adjusted to UTC. is_adjusted_to_u_t_c: bool, + /// The unit of time. unit: TimeUnit, }, + /// An integer with a specified bit width and signedness. Integer { + /// The number of bits in the integer. bit_width: i8, + /// Whether the integer is signed. is_signed: bool, }, + /// An unknown logical type. Unknown, + /// A JSON document. Json, + /// A BSON document. Bson, + /// A UUID. Uuid, + /// A 16-bit floating point number. Float16, } @@ -350,13 +381,21 @@ impl FromStr for Encoding { #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[allow(non_camel_case_types)] pub enum Compression { + /// No compression. 
UNCOMPRESSED, + /// [Snappy compression](https://en.wikipedia.org/wiki/Snappy_(compression)) SNAPPY, + /// [Gzip compression](https://www.ietf.org/rfc/rfc1952.txt) GZIP(GzipLevel), + /// [LZO compression](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer) LZO, + /// [Brotli compression](https://datatracker.ietf.org/doc/html/rfc7932) BROTLI(BrotliLevel), + /// [LZ4 compression](https://lz4.org/), [(deprecated)](https://issues.apache.org/jira/browse/PARQUET-2032) LZ4, + /// [ZSTD compression](https://datatracker.ietf.org/doc/html/rfc8878) ZSTD(ZstdLevel), + /// [LZ4 compression](https://lz4.org/). LZ4_RAW, } @@ -447,16 +486,20 @@ impl FromStr for Compression { } // ---------------------------------------------------------------------- -// Mirrors `parquet::PageType` - +/// Mirrors [parquet::PageType] +/// /// Available data pages for Parquet file format. /// Note that some of the page types may not be supported. #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[allow(non_camel_case_types)] pub enum PageType { + /// Data page Parquet 1.0 DATA_PAGE, + /// Index page INDEX_PAGE, + /// Dictionary page DICTIONARY_PAGE, + /// Data page Parquet 2.0 DATA_PAGE_V2, } diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index e3931dfe9e2b..5c866318e185 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -31,29 +31,51 @@ use crate::format::PageHeader; /// used to store uncompressed bytes of the page. #[derive(Clone)] pub enum Page { + /// Data page Parquet format v1. DataPage { + /// The underlying data buffer buf: Bytes, + /// Number of values in this page num_values: u32, + /// Encoding for values in this page encoding: Encoding, + /// Definition level encoding def_level_encoding: Encoding, + /// Repetition level encoding rep_level_encoding: Encoding, + /// Optional statistics for this page statistics: Option, }, + /// Data page Parquet format v2. 
DataPageV2 { + /// The underlying data buffer buf: Bytes, + /// Number of values in this page num_values: u32, + /// Encoding for values in this page encoding: Encoding, + /// Number of null values in this page num_nulls: u32, + /// Number of rows in this page num_rows: u32, + /// Length of definition levels def_levels_byte_len: u32, + /// Length of repetition levels rep_levels_byte_len: u32, + /// Is this page compressed is_compressed: bool, + /// Optional statistics for this page statistics: Option, }, + /// Dictionary page. DictionaryPage { + /// The underlying data buffer buf: Bytes, + /// Number of values in this page num_values: u32, + /// Encoding for values in this page encoding: Encoding, + /// Is dictionary page sorted is_sorted: bool, }, } @@ -235,11 +257,17 @@ impl CompressedPage { /// Contains page write metrics. pub struct PageWriteSpec { + /// The type of page being written pub page_type: PageType, + /// The total size of the page, before compression pub uncompressed_size: usize, + /// The compressed size of the page pub compressed_size: usize, + /// The number of values in the page pub num_values: u32, + /// The offset of the page in the column chunk pub offset: u64, + /// The number of bytes written to the underlying sink pub bytes_written: u64, } diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 0c7cbb412a42..2b43b4c3e45c 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -34,13 +34,21 @@ pub(crate) mod decoder; /// Column reader for a Parquet type. 
pub enum ColumnReader { + /// Column reader for boolean type BoolColumnReader(ColumnReaderImpl), + /// Column reader for int32 type Int32ColumnReader(ColumnReaderImpl), + /// Column reader for int64 type Int64ColumnReader(ColumnReaderImpl), + /// Column reader for int96 type Int96ColumnReader(ColumnReaderImpl), + /// Column reader for float type FloatColumnReader(ColumnReaderImpl), + /// Column reader for double type DoubleColumnReader(ColumnReaderImpl), + /// Column reader for byte array type ByteArrayColumnReader(ColumnReaderImpl), + /// Column reader for fixed length byte array type FixedLenByteArrayColumnReader(ColumnReaderImpl), } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index e0d3abed81ac..6071b68c62a5 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -61,13 +61,21 @@ macro_rules! downcast_writer { /// Column writer for a Parquet type. pub enum ColumnWriter<'a> { + /// Column writer for boolean type BoolColumnWriter(ColumnWriterImpl<'a, BoolType>), + /// Column writer for int32 type Int32ColumnWriter(ColumnWriterImpl<'a, Int32Type>), + /// Column writer for int64 type Int64ColumnWriter(ColumnWriterImpl<'a, Int64Type>), + /// Column writer for int96 (timestamp) type Int96ColumnWriter(ColumnWriterImpl<'a, Int96Type>), + /// Column writer for float type FloatColumnWriter(ColumnWriterImpl<'a, FloatType>), + /// Column writer for double type DoubleColumnWriter(ColumnWriterImpl<'a, DoubleType>), + /// Column writer for byte array type ByteArrayColumnWriter(ColumnWriterImpl<'a, ByteArrayType>), + /// Column writer for fixed length byte array type FixedLenByteArrayColumnWriter(ColumnWriterImpl<'a, FixedLenByteArrayType>), } @@ -90,6 +98,11 @@ impl<'a> ColumnWriter<'a> { } } +#[deprecated( + since = "54.0.0", + note = "Seems like a stray and nobody knows what's it for. Will be removed in the next release." 
+)] +#[allow(missing_docs)] pub enum Level { Page, Column, @@ -309,6 +322,7 @@ impl ColumnMetrics { /// Typed column writer for a primitive column. pub type ColumnWriterImpl<'a, T> = GenericColumnWriter<'a, ColumnValueEncoderImpl>; +/// Generic column writer for a primitive column. pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> { // Column writer properties descr: ColumnDescPtr, @@ -344,6 +358,7 @@ pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> { } impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { + /// Returns a new instance of [`GenericColumnWriter`]. pub fn new( descr: ColumnDescPtr, props: WriterPropertiesPtr, diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index a3bcfd16730f..a3d0e3ce78f7 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -183,6 +183,7 @@ impl ByteArray { ) } + /// Try to convert the byte array to a utf8 slice pub fn as_utf8(&self) -> Result<&str> { self.data .as_ref() @@ -349,20 +350,29 @@ impl From for ByteArray { pub enum Decimal { /// Decimal backed by `i32`. Int32 { + /// The underlying value value: [u8; 4], + /// The total number of digits in the number precision: i32, + /// The number of digits to the right of the decimal point scale: i32, }, /// Decimal backed by `i64`. Int64 { + /// The underlying value value: [u8; 8], + /// The total number of digits in the number precision: i32, + /// The number of digits to the right of the decimal point scale: i32, }, /// Decimal backed by byte array. Bytes { + /// The underlying value value: ByteArray, + /// The total number of digits in the number precision: i32, + /// The number of digits to the right of the decimal point scale: i32, }, } @@ -1120,6 +1130,7 @@ pub(crate) mod private { /// Contains the Parquet physical type information as well as the Rust primitive type /// presentation. pub trait DataType: 'static + Send { + /// The physical type of the Parquet data type. 
type T: private::ParquetValueType; /// Returns Parquet physical type. @@ -1130,20 +1141,24 @@ pub trait DataType: 'static + Send { /// Returns size in bytes for Rust representation of the physical type. fn get_type_size() -> usize; + /// Returns the underlying [`ColumnReaderImpl`] for the given [`ColumnReader`]. fn get_column_reader(column_writer: ColumnReader) -> Option> where Self: Sized; + /// Returns the underlying [`ColumnWriterImpl`] for the given [`ColumnWriter`]. fn get_column_writer(column_writer: ColumnWriter<'_>) -> Option> where Self: Sized; + /// Returns a reference to the underlying [`ColumnWriterImpl`] for the given [`ColumnWriter`]. fn get_column_writer_ref<'a, 'b: 'a>( column_writer: &'b ColumnWriter<'a>, ) -> Option<&'b ColumnWriterImpl<'a, Self>> where Self: Sized; + /// Returns a mutable reference to the underlying [`ColumnWriterImpl`] for the given [`ColumnWriter`]. fn get_column_writer_mut<'a, 'b: 'a>( column_writer: &'a mut ColumnWriter<'b>, ) -> Option<&'a mut ColumnWriterImpl<'b, Self>> @@ -1152,12 +1167,18 @@ pub trait DataType: 'static + Send { } // Workaround bug in specialization +#[deprecated( + since = "54.0.0", + note = "Seems like a stray and nobody knows what's it for. Will be removed in 55.0.0" +)] +#[allow(missing_docs)] pub trait SliceAsBytesDataType: DataType where Self::T: SliceAsBytes, { } +#[allow(deprecated)] impl SliceAsBytesDataType for T where T: DataType, @@ -1167,6 +1188,7 @@ where macro_rules! make_type { ($name:ident, $reader_ident: ident, $writer_ident: ident, $native_ty:ty, $size:expr) => { + #[doc = concat!("Parquet physical type: ", stringify!($name))] #[derive(Clone)] pub struct $name {} diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index a242c9768514..bb4d2543c7b4 100644 --- a/parquet/src/errors.rs +++ b/parquet/src/errors.rs @@ -42,6 +42,8 @@ pub enum ParquetError { /// Arrow error. /// Returned when reading into arrow or writing from arrow. 
ArrowError(String), + /// Error when the requested column index is more than the + /// number of columns in the row group IndexOutOfBound(usize, usize), /// An external error variant External(Box), diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 3dd698e3d443..bd31c9142f56 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Module for working with Parquet file footers. + use crate::errors::Result; use crate::file::{metadata::*, reader::ChunkReader, FOOTER_SIZE}; diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 30b17b6a2f78..5a2ccbc0241f 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -419,6 +419,7 @@ impl From for ParquetMetaDataBuilder { } } +/// A key-value pair for [`FileMetaData`]. pub type KeyValue = crate::format::KeyValue; /// Reference counted pointer for [`FileMetaData`]. @@ -722,6 +723,7 @@ impl RowGroupMetaDataBuilder { self } + /// Sets file offset for this row group. pub fn set_file_offset(mut self, value: i64) -> Self { self.0.file_offset = Some(value); self @@ -1409,6 +1411,7 @@ impl Default for ColumnIndexBuilder { } impl ColumnIndexBuilder { + /// Creates a new column index builder. pub fn new() -> Self { ColumnIndexBuilder { null_pages: Vec::new(), @@ -1458,6 +1461,7 @@ impl ColumnIndexBuilder { } } + /// Set the boundary order of the column index pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) { self.boundary_order = boundary_order; } @@ -1506,6 +1510,7 @@ impl Default for OffsetIndexBuilder { } impl OffsetIndexBuilder { + /// Creates a new offset index builder. pub fn new() -> Self { OffsetIndexBuilder { offset_array: Vec::new(), @@ -1516,17 +1521,20 @@ impl OffsetIndexBuilder { } } + /// Append the row count of the next page. 
pub fn append_row_count(&mut self, row_count: i64) { let current_page_row_index = self.current_first_row_index; self.first_row_index_array.push(current_page_row_index); self.current_first_row_index += row_count; } + /// Append the offset and size of the next page. pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) { self.offset_array.push(offset); self.compressed_page_size_array.push(compressed_page_size); } + /// Append the unencoded byte array data bytes of the next page. pub fn append_unencoded_byte_array_data_bytes( &mut self, unencoded_byte_array_data_bytes: Option, diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index 44328c635fed..69a939e00f55 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -286,6 +286,7 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { Self { buf, metadata } } + /// Write the metadata to the buffer pub fn finish(mut self) -> Result<()> { let file_metadata = self.metadata.file_metadata(); diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 2f30abead25c..a66509e14c7a 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -50,18 +50,31 @@ pub struct PageIndex { } impl PageIndex { + /// Returns the minimum value in the page + /// + /// It is `None` when all values are null pub fn min(&self) -> Option<&T> { self.min.as_ref() } + + /// Returns the maximum value in the page + /// + /// It is `None` when all values are null pub fn max(&self) -> Option<&T> { self.max.as_ref() } + + /// Returns the number of null values in the page pub fn null_count(&self) -> Option { self.null_count } + + /// Returns the repetition level histogram for the page pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> { self.repetition_level_histogram.as_ref() } + + /// Returns the definition level histogram for the page pub fn 
definition_level_histogram(&self) -> Option<&LevelHistogram> { self.definition_level_histogram.as_ref() } @@ -71,10 +84,16 @@ impl PageIndex where T: AsBytes, { + /// Returns the maximum value in the page as bytes + /// + /// It is `None` when all values are null pub fn max_bytes(&self) -> Option<&[u8]> { self.max.as_ref().map(|x| x.as_bytes()) } + /// Returns the minimum value in the page as bytes + /// + /// It is `None` when all values are null pub fn min_bytes(&self) -> Option<&[u8]> { self.min.as_ref().map(|x| x.as_bytes()) } @@ -90,13 +109,21 @@ pub enum Index { /// will only return pageLocations without min_max index, /// `NONE` represents this lack of index information NONE, + /// Boolean type index BOOLEAN(NativeIndex), + /// 32-bit integer type index INT32(NativeIndex), + /// 64-bit integer type index INT64(NativeIndex), + /// 96-bit integer type (timestamp) index INT96(NativeIndex), + /// 32-bit floating point type index FLOAT(NativeIndex), + /// 64-bit floating point type index DOUBLE(NativeIndex), + /// Byte array type index BYTE_ARRAY(NativeIndex), + /// Fixed length byte array type index FIXED_LEN_BYTE_ARRAY(NativeIndex), } @@ -155,6 +182,7 @@ pub struct NativeIndex { } impl NativeIndex { + /// The physical data type of the column pub const PHYSICAL_TYPE: Type = T::PHYSICAL_TYPE; /// Creates a new [`NativeIndex`] diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 2ae3464141ca..d48d1b6c083d 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -24,7 +24,10 @@ use crate::format::{OffsetIndex, PageLocation}; /// in the chunk. Optionally stores fully decoded page sizes for BYTE_ARRAY columns. #[derive(Debug, Clone, PartialEq)] pub struct OffsetIndexMetaData { + /// Vector of [`PageLocation`] objects, one per page in the chunk. pub page_locations: Vec, + /// Optional vector of unencoded page sizes, one per page in the chunk. 
+ /// Only defined for BYTE_ARRAY columns. pub unencoded_byte_array_data_bytes: Option>, } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 61f6390c97d4..efcb63258f99 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -64,7 +64,9 @@ pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option = None; #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[allow(non_camel_case_types)] pub enum WriterVersion { + /// Parquet format version 1.0 PARQUET_1_0, + /// Parquet format version 2.0 PARQUET_2_0, } diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs index d8a61fafe3d7..400441f0c9cd 100644 --- a/parquet/src/file/reader.rs +++ b/parquet/src/file/reader.rs @@ -61,7 +61,7 @@ pub trait Length { /// User provided implementations can implement more sophisticated behaviors /// such as on-demand buffering or scan sharing. pub trait ChunkReader: Length + Send + Sync { - /// The concrete type of readers returned by this trait + /// The concrete type of reader returned by this trait type T: Read; /// Get a [`Read`] instance starting at the provided file offset diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 50ed06436d86..2e05b83369cf 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -81,9 +81,10 @@ pub(crate) mod private { gen_make_statistics!(FixedLenByteArray, FixedLenByteArray); } -// Macro to generate methods create Statistics. +/// Macro to generate methods to create Statistics. macro_rules! statistics_new_func { ($func:ident, $vtype:ty, $stat:ident) => { + #[doc = concat!("Creates new statistics for `", stringify!($stat), "` column type.")] pub fn $func( min: $vtype, max: $vtype, @@ -244,7 +245,7 @@ pub fn from_thrift( }) } -// Convert Statistics into Thrift definition. +/// Convert Statistics into Thrift definition. 
pub fn to_thrift(stats: Option<&Statistics>) -> Option { let stats = stats?; @@ -306,13 +307,21 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { /// [NativeIndex]: crate::file::page_index::index::NativeIndex #[derive(Debug, Clone, PartialEq)] pub enum Statistics { + /// Statistics for Boolean column Boolean(ValueStatistics), + /// Statistics for Int32 column Int32(ValueStatistics), + /// Statistics for Int64 column Int64(ValueStatistics), + /// Statistics for Int96 column Int96(ValueStatistics), + /// Statistics for Float column Float(ValueStatistics), + /// Statistics for Double column Double(ValueStatistics), + /// Statistics for ByteArray column ByteArray(ValueStatistics), + /// Statistics for FixedLenByteArray column FixedLenByteArray(ValueStatistics), } @@ -323,6 +332,7 @@ impl From> for Statistics { } impl Statistics { + /// Creates new statistics for a column type pub fn new( min: Option, max: Option, diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 7b7bfa19c346..afbe1e549f56 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -322,6 +322,7 @@ impl SerializedFileWriter { } } + /// Add a [`KeyValue`] to the file writer's metadata pub fn append_key_value_metadata(&mut self, kv_metadata: KeyValue) { self.kv_metadatas.push(kv_metadata); } diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index a54d4a427635..3b63845e709c 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -82,6 +82,7 @@ //! [Logical Types]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md //! 
[object_store]: https://docs.rs/object_store/latest/object_store/ +#![warn(missing_docs)] /// Defines a an item with an experimental public API /// /// The module will not be documented, and will only be public if the @@ -117,7 +118,7 @@ pub mod basic; /// [parquet.thrift]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift // see parquet/CONTRIBUTING.md for instructions on regenerating // Don't try clippy and format auto generated code -#[allow(clippy::all)] +#[allow(clippy::all, missing_docs)] #[rustfmt::skip] pub mod format; diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index 85d96fd65275..7a2e268b30f1 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -101,6 +101,7 @@ impl Row { } } + /// Converts the row into a JSON object. #[cfg(any(feature = "json", test))] pub fn to_json_value(&self) -> Value { Value::Object( @@ -134,25 +135,45 @@ impl<'a> Iterator for RowColumnIter<'a> { /// Trait for type-safe convenient access to fields within a Row. pub trait RowAccessor { + /// Try to get a boolean value at the given index. fn get_bool(&self, i: usize) -> Result; + /// Try to get a byte value at the given index. fn get_byte(&self, i: usize) -> Result; + /// Try to get a short value at the given index. fn get_short(&self, i: usize) -> Result; + /// Try to get a int value at the given index. fn get_int(&self, i: usize) -> Result; + /// Try to get a long value at the given index. fn get_long(&self, i: usize) -> Result; + /// Try to get a ubyte value at the given index. fn get_ubyte(&self, i: usize) -> Result; + /// Try to get a ushort value at the given index. fn get_ushort(&self, i: usize) -> Result; + /// Try to get a uint value at the given index. fn get_uint(&self, i: usize) -> Result; + /// Try to get a ulong value at the given index. fn get_ulong(&self, i: usize) -> Result; + /// Try to get a float16 value at the given index. 
fn get_float16(&self, i: usize) -> Result; + /// Try to get a float value at the given index. fn get_float(&self, i: usize) -> Result; + /// Try to get a double value at the given index. fn get_double(&self, i: usize) -> Result; + /// Try to get a timestamp value in milliseconds at the given index. fn get_timestamp_millis(&self, i: usize) -> Result; + /// Try to get a timestamp value in microseconds at the given index. fn get_timestamp_micros(&self, i: usize) -> Result; + /// Try to get a decimal value at the given index. fn get_decimal(&self, i: usize) -> Result<&Decimal>; + /// Try to get a string value at the given index. fn get_string(&self, i: usize) -> Result<&String>; + /// Try to get a bytes value at the given index. fn get_bytes(&self, i: usize) -> Result<&ByteArray>; + /// Try to get a group value at the given index. fn get_group(&self, i: usize) -> Result<&Row>; + /// Try to get a list value at the given index. fn get_list(&self, i: usize) -> Result<&List>; + /// Try to get a map value at the given index. fn get_map(&self, i: usize) -> Result<&Map>; } @@ -175,6 +196,7 @@ pub trait RowAccessor { /// ``` /// pub trait RowFormatter { + /// The method to format a field at the given index. fn fmt(&self, i: usize) -> &dyn fmt::Display; } @@ -295,6 +317,7 @@ impl List { self.elements.len() } + /// Get the reference to the elements in this list pub fn elements(&self) -> &[Field] { self.elements.as_slice() } @@ -309,25 +332,47 @@ pub fn make_list(elements: Vec) -> List { /// Trait for type-safe access of an index for a `List`. /// Note that the get_XXX methods do not do bound checking. pub trait ListAccessor { + /// Try getting a `boolean` value at the given index. fn get_bool(&self, i: usize) -> Result; + /// Try getting a `byte` value at the given index. fn get_byte(&self, i: usize) -> Result; + /// Try getting an `i16` value at the given index. fn get_short(&self, i: usize) -> Result; + /// Try getting an `i32` value at the given index. 
fn get_int(&self, i: usize) -> Result; + /// Try getting an `i64` value at the given index. fn get_long(&self, i: usize) -> Result; + /// Try getting a `u8` value at the given index. fn get_ubyte(&self, i: usize) -> Result; + /// Try getting a `u16` value at the given index. fn get_ushort(&self, i: usize) -> Result; + /// Try getting a `u32` value at the given index. fn get_uint(&self, i: usize) -> Result; + /// Try getting a `u64` value at the given index. fn get_ulong(&self, i: usize) -> Result; + /// Try getting a `f16` value at the given index. fn get_float16(&self, i: usize) -> Result; + /// Try getting a `f32` value at the given index. fn get_float(&self, i: usize) -> Result; + /// Try getting a `f64` value at the given index. fn get_double(&self, i: usize) -> Result; + /// Try getting a `timestamp` as milliseconds value + /// encoded as `i64` at the given index. fn get_timestamp_millis(&self, i: usize) -> Result; + /// Try getting a `timestamp` as microseconds value + /// encoded as `i64` at the given index. fn get_timestamp_micros(&self, i: usize) -> Result; + /// Try getting a `decimal` value at the given index. fn get_decimal(&self, i: usize) -> Result<&Decimal>; + /// Try getting a `string` value at the given index. fn get_string(&self, i: usize) -> Result<&String>; + /// Try getting a `bytes` value at the given index. fn get_bytes(&self, i: usize) -> Result<&ByteArray>; + /// Try getting a `group` value at the given index. fn get_group(&self, i: usize) -> Result<&Row>; + /// Try getting a `list` value at the given index. fn get_list(&self, i: usize) -> Result<&List>; + /// Try getting a `map` value at the given index. 
fn get_map(&self, i: usize) -> Result<&Map>; } @@ -420,6 +465,7 @@ impl Map { self.entries.len() } + /// Get the reference to the key-value pairs in this map pub fn entries(&self) -> &[(Field, Field)] { self.entries.as_slice() } @@ -433,7 +479,9 @@ pub fn make_map(entries: Vec<(Field, Field)>) -> Map { /// Trait for type-safe access of an index for a `Map` pub trait MapAccessor { + /// Get the keys of the map. fn get_keys<'a>(&'a self) -> Box; + /// Get the values of the map. fn get_values<'a>(&'a self) -> Box; } @@ -532,13 +580,13 @@ pub enum Field { Int(i32), /// Signed integer INT_64. Long(i64), - // Unsigned integer UINT_8. + /// Unsigned integer UINT_8. UByte(u8), - // Unsigned integer UINT_16. + /// Unsigned integer UINT_16. UShort(u16), - // Unsigned integer UINT_32. + /// Unsigned integer UINT_32. UInt(u32), - // Unsigned integer UINT_64. + /// Unsigned integer UINT_64. ULong(u64), /// IEEE 16-bit floating point value. Float16(f16), @@ -717,6 +765,7 @@ impl Field { Ok(field) } + /// Converts the Parquet field into a JSON [`Value`]. #[cfg(any(feature = "json", test))] pub fn to_json_value(&self) -> Value { use base64::prelude::BASE64_STANDARD; diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index cc29658d918c..57469ee9c372 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -304,18 +304,18 @@ impl TreeBuilder { /// Reader tree for record assembly pub enum Reader { - // Primitive reader with type information and triplet iterator + /// Primitive reader with type information and triplet iterator PrimitiveReader(TypePtr, Box), - // Optional reader with definition level of a parent and a reader + /// Optional reader with definition level of a parent and a reader OptionReader(i16, Box), - // Group (struct) reader with type information, definition level and list of child - // readers. 
When it represents message type, type information is None + /// Group (struct) reader with type information, definition level and list of child + /// readers. When it represents message type, type information is None GroupReader(Option, i16, Vec), - // Reader for repeated values, e.g. lists, contains type information, definition - // level, repetition level and a child reader + /// Reader for repeated values, e.g. lists, contains type information, definition + /// level, repetition level and a child reader RepeatedReader(TypePtr, i16, i16, Box), - // Reader of key-value pairs, e.g. maps, contains type information, definition - // level, repetition level, child reader for keys and child reader for values + /// Reader of key-value pairs, e.g. maps, contains type information, definition + /// level, repetition level, child reader for keys and child reader for values KeyValueReader(TypePtr, i16, i16, Box, Box), } diff --git a/parquet/src/record/record_reader.rs b/parquet/src/record/record_reader.rs index cfaf14a3d6f8..75ca4e3e3976 100644 --- a/parquet/src/record/record_reader.rs +++ b/parquet/src/record/record_reader.rs @@ -18,11 +18,12 @@ use super::super::errors::ParquetError; use super::super::file::reader::RowGroupReader; -/// Read up to `max_records` records from `row_group_reader` into `self`. +/// Read up to `num_records` records from `row_group_reader` into `self`. /// /// The type parameter `T` is used to work around the rust orphan rule /// when implementing on types such as `Vec`. pub trait RecordReader { + /// Read up to `num_records` records from `row_group_reader` into `self`. fn read_from_row_group( &mut self, row_group_reader: &mut dyn RowGroupReader, diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 2665f28fed54..39d2fa28c627 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -45,15 +45,24 @@ pub type ColumnDescPtr = Arc; /// repetition is `None`. 
#[derive(Clone, Debug, PartialEq)] pub enum Type { + /// Represents a primitive leaf field. PrimitiveType { + /// Basic information about the type. basic_info: BasicTypeInfo, + /// Physical type of this primitive type. physical_type: PhysicalType, + /// Length of this type. type_length: i32, + /// Scale of this type. scale: i32, + /// Precision of this type. precision: i32, }, + /// Represents a group of fields (similar to struct). GroupType { + /// Basic information about the type. basic_info: BasicTypeInfo, + /// Fields of this group type. fields: Vec, }, } @@ -745,6 +754,7 @@ impl ColumnPath { self.parts.append(&mut tail); } + /// Returns a slice of path components. pub fn parts(&self) -> &[String] { &self.parts } @@ -1033,6 +1043,7 @@ impl SchemaDescriptor { self.schema.as_ref() } + /// Returns schema as [`TypePtr`] for cheap cloning. pub fn root_schema_ptr(&self) -> TypePtr { self.schema.clone() } diff --git a/parquet/src/schema/visitor.rs b/parquet/src/schema/visitor.rs index 35fde11f1fbb..7a10d3a5ffd6 100644 --- a/parquet/src/schema/visitor.rs +++ b/parquet/src/schema/visitor.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Utilities to traverse the various Parquet types. + use crate::basic::{ConvertedType, Repetition}; use crate::errors::ParquetError::General; use crate::errors::Result; diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs index abb2ac13c4ed..5be025f9540f 100644 --- a/parquet/src/thrift.rs +++ b/parquet/src/thrift.rs @@ -27,7 +27,9 @@ use thrift::protocol::{ /// /// Unlike [`thrift::protocol::TSerializable`] this uses generics instead of trait objects pub trait TSerializable: Sized { + /// Reads the struct from the input Thrift protocol fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result; + /// Writes the struct to the output Thrift protocol fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()>; }