diff --git a/Cargo.toml b/Cargo.toml index d59a5af68a19..3267bf47dfa4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,8 @@ members = [ "arrow-integration-testing", "arrow-ipc", "arrow-json", + "arrow-orc", + "arrow-orc/orc-gen", "arrow-ord", "arrow-row", "arrow-schema", @@ -86,6 +88,7 @@ arrow-csv = { version = "48.0.0", path = "./arrow-csv" } arrow-data = { version = "48.0.0", path = "./arrow-data" } arrow-ipc = { version = "48.0.0", path = "./arrow-ipc" } arrow-json = { version = "48.0.0", path = "./arrow-json" } +arrow-orc = { version = "48.0.0", path = "./arrow-orc" } arrow-ord = { version = "48.0.0", path = "./arrow-ord" } arrow-row = { version = "48.0.0", path = "./arrow-row" } arrow-schema = { version = "48.0.0", path = "./arrow-schema" } diff --git a/arrow-orc/Cargo.toml b/arrow-orc/Cargo.toml new file mode 100644 index 000000000000..aff336de92f1 --- /dev/null +++ b/arrow-orc/Cargo.toml @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-orc" +version = { workspace = true } +description = "Support for parsing ORC format into the Arrow format" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } + +[lib] +name = "arrow_orc" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-cast = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } + +bytes = { version = "1", default-features = false, features = ["std"] } +snap = { version = "1.1", default-features = false } +flate2 = { version = "1.0", default-features = false, features = ["rust_backend"] } +lz4_flex = { version = "0.11", default-features = false, features = ["std"] } +zstd = { version = "0.12", default-features = false } +lzokay-native = "0.1" + +prost = "0.12.1" + diff --git a/arrow-orc/orc-gen/Cargo.toml b/arrow-orc/orc-gen/Cargo.toml new file mode 100644 index 000000000000..a0a6927416fe --- /dev/null +++ b/arrow-orc/orc-gen/Cargo.toml @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "orc-gen" +description = "Code generation for arrow-orc" +version = "0.1.0" +edition = { workspace = true } +rust-version = { workspace = true } +authors = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +license = { workspace = true } +publish = false + + +[dependencies] +prost-build = { version = "=0.12.1", default-features = false } diff --git a/arrow-orc/orc-gen/src/main.rs b/arrow-orc/orc-gen/src/main.rs new file mode 100644 index 000000000000..3a960e8f245e --- /dev/null +++ b/arrow-orc/orc-gen/src/main.rs @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + fs::{remove_file, OpenOptions}, + io::{Read, Write}, +}; + +fn main() -> Result<(), Box> { + prost_build::Config::new() + .out_dir("src/") + .compile_well_known_types() + .extern_path(".google.protobuf", "::pbjson_types") + .compile_protos(&["../format/orc_proto.proto"], &["../format"])?; + + // read file contents to string + let mut file = OpenOptions::new().read(true).open("src/orc.proto.rs")?; + let mut buffer = String::new(); + file.read_to_string(&mut buffer)?; + // append warning that file was auto-generate + let mut file = OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open("src/proto.rs")?; + file.write_all("// This file was automatically generated through the regen.sh script, and should not be edited.\n\n".as_bytes())?; + file.write_all(buffer.as_bytes())?; + + // since we renamed file to proto.rs to avoid period in the name + remove_file("src/orc.proto.rs")?; + + // As the proto file is checked in, the build should not fail if the file is not found + Ok(()) +} diff --git a/arrow-orc/regen.sh b/arrow-orc/regen.sh new file mode 100755 index 000000000000..f543928ed76c --- /dev/null +++ b/arrow-orc/regen.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd $SCRIPT_DIR && cargo run --manifest-path orc-gen/Cargo.toml diff --git a/arrow-orc/src/array_reader/mod.rs b/arrow-orc/src/array_reader/mod.rs new file mode 100644 index 000000000000..74ab8b69173c --- /dev/null +++ b/arrow-orc/src/array_reader/mod.rs @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Read ORC file columns as Arrow arrays. + +use arrow_array::ArrayRef; + +use crate::errors::Result; + +pub mod struct_array_reader; + +/// Used to be able to read batches of data from columns into Arrow arrays. +pub trait ArrayReader { + fn next_batch(&mut self, batch_size: usize) -> Result>; +} diff --git a/arrow-orc/src/array_reader/struct_array_reader.rs b/arrow-orc/src/array_reader/struct_array_reader.rs new file mode 100644 index 000000000000..53dd1c462844 --- /dev/null +++ b/arrow-orc/src/array_reader/struct_array_reader.rs @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Read Struct Arrays from ORC file column + +use std::sync::Arc; + +use arrow_array::{Array, ArrayRef, StructArray}; +use arrow_data::ArrayDataBuilder; +use arrow_schema::DataType; + +use crate::errors::Result; + +use super::ArrayReader; + +pub struct StructArrayReader { + children: Vec>, + data_type: DataType, +} + +impl StructArrayReader { + pub fn new(children: Vec>, data_type: DataType) -> Self { + Self { + children, + data_type, + } + } + + // For convenience when reading root of ORC file (expect Struct as root type) + pub fn next_struct_array_batch( + &mut self, + batch_size: usize, + ) -> Result>> { + if self.children.is_empty() { + return Ok(None); + } + + let children_arrays = self + .children + .iter_mut() + .map(|reader| reader.next_batch(batch_size)) + .collect::>>()?; + let expected_length = children_arrays + .first() + .and_then(|a| a.as_ref().map(Array::len)); + let all_child_len_match = children_arrays + .iter() + .all(|array| array.as_ref().map(Array::len) == expected_length); + if !all_child_len_match { + return Err(general_err!( + "Struct array reader has children with mismatched lengths" + )); + } + + match expected_length { + None => Ok(None), + Some(length) => { + // TODO: account for nullability? + let array_data = ArrayDataBuilder::new(self.data_type.clone()) + .len(length) + .child_data( + children_arrays + .iter() + .flatten() + .map(Array::to_data) + .collect::>(), + ); + let array_data = array_data.build()?; + Ok(Some(Arc::new(StructArray::from(array_data)))) + } + } + } +} + +impl ArrayReader for StructArrayReader { + fn next_batch(&mut self, batch_size: usize) -> Result> { + self.next_struct_array_batch(batch_size) + .map(|opt| opt.map(|sa| sa as ArrayRef)) + } +} diff --git a/arrow-orc/src/decompress.rs b/arrow-orc/src/decompress.rs new file mode 100644 index 000000000000..b0f8688e0743 --- /dev/null +++ b/arrow-orc/src/decompress.rs @@ -0,0 +1,200 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Handle generic decompression of ORC files. + +use std::io::Read; + +use crate::errors::{OrcError, Result}; +use crate::proto; + +/// Supported generic compression types. +/// Compression block size indicates maximum size of each compression chunk. +/// No chunk will decompress to larger than thus block size. +// TODO: use compression block size for other variants too +#[derive(Debug, PartialEq, Eq, Copy, Clone)] +pub enum CompressionType { + Lz4 { compression_block_size: u64 }, + Lzo, + Snappy, + Zlib, + Zstd, +} + +impl CompressionType { + pub fn from_proto( + value: proto::CompressionKind, + compression_block_size: Option, + ) -> Result> { + let ct = match (value, compression_block_size) { + (proto::CompressionKind::None, None) => None, + (proto::CompressionKind::Zlib, Some(_size)) => Some(CompressionType::Zlib), + (proto::CompressionKind::Snappy, Some(_size)) => Some(CompressionType::Snappy), + (proto::CompressionKind::Lzo, Some(_size)) => Some(CompressionType::Lzo), + (proto::CompressionKind::Lz4, Some(compression_block_size)) => { + Some(CompressionType::Lz4 { + compression_block_size, + }) + } + (proto::CompressionKind::Zstd, Some(_size)) => Some(CompressionType::Zstd), + _ => { + return Err(OrcError::Corrupted( + "Invalid compression settings".to_string(), + )) + } + }; + Ok(ct) + } +} + +/// ORC files are compressed in blocks, with a 3 byte header at the start +/// of these blocks indicating the length of the block and whether it's +/// compressed or not. +fn decode_header(bytes: [u8; 3]) -> CompressionHeader { + let bytes = [bytes[0], bytes[1], bytes[2], 0]; + let length = u32::from_le_bytes(bytes); + let is_original = length & 1 == 1; + // to clear the is_original bit + let length = length >> 1; + if is_original { + CompressionHeader::Original(length) + } else { + CompressionHeader::Compressed(length) + } +} + +/// Indicates length of block and whether it's compressed or not. +#[derive(Debug, PartialEq, Eq)] +enum CompressionHeader { + Original(u32), + Compressed(u32), +} + +/// Use to decompress a reader of bytes, according to ORC specification: +/// +/// - Bytes are grouped into blocks +/// - Each block has a 3 byte header, indicating length of block and if +/// the block is compressed or the uncompressed original bytes +pub struct Decompressor { + reader: R, + decompressed_block: Vec, + block_start_index: usize, + compression_type: CompressionType, +} + +impl Decompressor { + pub fn new(reader: R, compression_type: CompressionType) -> Self { + Self { + reader, + decompressed_block: vec![], + block_start_index: 0, + compression_type, + } + } + + fn process_compressed_block(&mut self, compressed_block: &[u8]) -> Result<()> { + self.decompressed_block.clear(); + match self.compression_type { + CompressionType::Lzo => { + let decompressed = lzokay_native::decompress_all(compressed_block, None)?; + self.decompressed_block.extend(decompressed); + } + CompressionType::Lz4 { + compression_block_size, + } => { + let decompressed = + lz4_flex::block::decompress(compressed_block, compression_block_size as usize)?; + self.decompressed_block.extend(decompressed); + } + CompressionType::Snappy => { + let len = snap::raw::decompress_len(compressed_block)?; + self.decompressed_block.resize(len, 0); + let mut decoder = snap::raw::Decoder::new(); + decoder.decompress(compressed_block, &mut self.decompressed_block)?; + } + CompressionType::Zlib => { + let mut reader = flate2::read::DeflateDecoder::new(compressed_block); + reader.read_to_end(&mut self.decompressed_block)?; + } + CompressionType::Zstd => { + zstd::stream::copy_decode(compressed_block, &mut self.decompressed_block)?; + } + }; + Ok(()) + } +} + +impl Read for Decompressor { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if buf.is_empty() { + return Ok(0); + } + + // if finished copying from decompressed_block + // grab next block + if self.block_start_index >= self.decompressed_block.len() { + let mut header = [0; 3]; + let size = self.reader.read(&mut header[..1])?; + if size == 0 { + // exhausted + return Ok(0); + } + // otherwise get other header bytes + self.reader.read_exact(&mut header[1..])?; + + match decode_header(header) { + CompressionHeader::Original(len) => { + self.decompressed_block.resize(len as usize, 0); + self.reader.read_exact(&mut self.decompressed_block)?; + } + CompressionHeader::Compressed(len) => { + let mut compressed = vec![0; len as usize]; + self.reader.read_exact(&mut compressed)?; + self.process_compressed_block(&compressed)?; + } + }; + self.block_start_index = 0; + } + + // copy out the decompressed bytes + let bytes_written = buf + .len() + .min(self.decompressed_block.len() - self.block_start_index); + let end = self.block_start_index + bytes_written; + buf[..bytes_written].copy_from_slice(&self.decompressed_block[self.block_start_index..end]); + self.block_start_index = end; + Ok(bytes_written) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_decode_header() { + let header = [0x40, 0x0d, 0x03]; + let actual = decode_header(header); + let expected = CompressionHeader::Compressed(100_000); + assert_eq!(expected, actual); + + let header = [0x0b, 0x00, 0x00]; + let actual = decode_header(header); + let expected = CompressionHeader::Original(5); + assert_eq!(expected, actual); + } +} diff --git a/arrow-orc/src/errors.rs b/arrow-orc/src/errors.rs new file mode 100644 index 000000000000..e82afdaf3fcc --- /dev/null +++ b/arrow-orc/src/errors.rs @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! ORC errors and related utility macros. + +use arrow_schema::ArrowError; +use prost::DecodeError; +use std::error::Error; +use std::{io, result, str}; + +// TODO: more specific errors +#[derive(Debug)] +pub enum OrcError { + /// Generic error + General(String), + /// When couldn't convert to/from Arrow schema + SchemaConversion(String), + /// When file doesn't conform to expected spec + Corrupted(String), + /// Functionality not yet implemented + NotYetImplemented(String), + /// External error + External(Box), +} + +impl std::fmt::Display for OrcError { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + match &self { + Self::General(m) => write!(fmt, "ORC error: {m}"), + Self::SchemaConversion(m) => write!(fmt, "ORC schema error: {m}"), + Self::Corrupted(m) => write!(fmt, "ORC file out of specification: {m}"), + Self::NotYetImplemented(m) => { + write!(fmt, "ORC feature not yet implemented: {m}") + } + Self::External(m) => write!(fmt, "External: {m}"), + } + } +} + +impl Error for OrcError { + fn source(&self) -> Option<&(dyn Error + 'static)> { + match self { + Self::External(e) => Some(e.as_ref()), + _ => None, + } + } +} + +impl From for OrcError { + fn from(e: io::Error) -> Self { + Self::External(Box::new(e)) + } +} + +impl From for OrcError { + fn from(e: snap::Error) -> Self { + Self::External(Box::new(e)) + } +} + +impl From for OrcError { + fn from(e: lzokay_native::Error) -> Self { + Self::External(Box::new(e)) + } +} + +impl From for OrcError { + fn from(e: str::Utf8Error) -> Self { + Self::External(Box::new(e)) + } +} + +impl From for OrcError { + fn from(e: lz4_flex::block::DecompressError) -> Self { + Self::External(Box::new(e)) + } +} + +impl From for OrcError { + fn from(e: ArrowError) -> Self { + Self::External(Box::new(e)) + } +} + +impl From for OrcError { + fn from(e: DecodeError) -> Self { + Self::External(Box::new(e)) + } +} + +/// A specialized `Result` for ORC errors. +pub type Result = result::Result; + +// ---------------------------------------------------------------------- +// Convenient macros for different errors + +macro_rules! general_err { + ($fmt:expr) => (crate::errors::OrcError::General($fmt.to_owned())); + ($fmt:expr, $($args:expr),*) => (crate::errors::OrcError::General(format!($fmt, $($args),*))); + ($e:expr, $fmt:expr) => (crate::errors::OrcError::General($fmt.to_owned(), $e)); + ($e:ident, $fmt:expr, $($args:tt),*) => ( + crate::errors::OrcError::General(&format!($fmt, $($args),*), $e)); +} + +macro_rules! nyi_err { + ($fmt:expr) => (crate::errors::OrcError::NotYetImplemented($fmt.to_owned())); + ($fmt:expr, $($args:expr),*) => (crate::errors::OrcError::NotYetImplemented(format!($fmt, $($args),*))); +} + +// ---------------------------------------------------------------------- +// Convert ORC error into other errors + +impl From for ArrowError { + fn from(p: OrcError) -> Self { + Self::OrcError(format!("{p}")) + } +} + +impl From for std::io::Error { + fn from(value: OrcError) -> Self { + std::io::Error::new(std::io::ErrorKind::Other, value) + } +} diff --git a/arrow-orc/src/file_metadata.rs b/arrow-orc/src/file_metadata.rs new file mode 100644 index 000000000000..8ea66d808497 --- /dev/null +++ b/arrow-orc/src/file_metadata.rs @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Reading ORC file metadata. + +use arrow_schema::SchemaRef; +use prost::Message; + +use crate::decompress::{CompressionType, Decompressor}; +use crate::errors::Result; +use crate::proto; +use crate::reader::Reader; +use crate::schema::to_root_schema; + +use std::borrow::Cow; +use std::io::Read; + +/// How many bytes to read in first read from file. +/// Ideally will contain postscript, footer and metadata sections. +const FIRST_READ_BYTES_SIZE: u64 = 16 * 1024; + +/// Given a reader over an ORC file, will seek to the end and read the metadata +/// located at the tail of the file. +pub fn parse_metadata(reader: &mut R) -> Result { + let reader_length = reader.len(); + if reader_length == 0 { + return Err(general_err!("Cannot read metadata from empty file")); + } + // in case file is smaller than expected + let first_chunk_size = FIRST_READ_BYTES_SIZE.min(reader_length); + + let offset = reader_length - first_chunk_size; + let bytes = reader.get_bytes(offset, first_chunk_size)?; + + // safe split since bytes isn't empty + // postscript length is encoded as single last byte in file + let (bytes, postscript_length_byte) = bytes.split_at(bytes.len() - 1); + let postscript_length = postscript_length_byte[0] as usize; + + // if file is too small for stated postscript section length + if postscript_length > bytes.len() { + return Err(general_err!("Invalid postscript length")); + } + // safe split as here we're guaranteed we have enough bytes for the postscript + let (bytes, postscript_bytes) = bytes.split_at(bytes.len() - postscript_length); + let postscript = proto::PostScript::decode(postscript_bytes)?; + + let compression_type = + CompressionType::from_proto(postscript.compression(), postscript.compression_block_size)?; + let footer_length = postscript.footer_length(); + let metadata_length = postscript.metadata_length(); + + let bytes_len = bytes.len() as u64; + let bytes = if (footer_length + metadata_length) > bytes_len { + // need to read more bytes as footer + metadata size exceeds initial read chunk + let bytes_to_read = (footer_length + metadata_length) - bytes_len; + let offset = reader_length - first_chunk_size - bytes_to_read; + + let mut extra_bytes = reader.get_bytes(offset, bytes_to_read)?; + extra_bytes.extend_from_slice(bytes); + Cow::Owned(extra_bytes) + } else { + Cow::Borrowed(bytes) + }; + + // here on we are guaranteed enough bytes for whatever we need + let (bytes, footer_bytes) = bytes.split_at(bytes.len() - footer_length as usize); + // footer and metadata may be optionally compressed + // if compression was set in postscript + let footer = match compression_type { + Some(compression) => { + let mut bytes = vec![]; + Decompressor::new(footer_bytes, compression).read_to_end(&mut bytes)?; + proto::Footer::decode(bytes.as_ref())? + } + None => proto::Footer::decode(footer_bytes)?, + }; + + let (_, metadata_bytes) = bytes.split_at(bytes.len() - metadata_length as usize); + // TODO: make use of metadata for statistics + let _metadata = match compression_type { + Some(compression) => { + let mut bytes = vec![]; + Decompressor::new(metadata_bytes, compression).read_to_end(&mut bytes)?; + proto::Metadata::decode(bytes.as_ref())? + } + None => proto::Metadata::decode(metadata_bytes)?, + }; + + let schema = to_root_schema(&footer.types)?; + let number_of_rows = footer.number_of_rows(); + let stripes = footer + .stripes + .into_iter() + .map(StripeInformation::from) + .collect::>(); + + Ok(OrcMetadata { + compression_type, + stripes, + schema, + number_of_rows, + }) +} + +/// Contains general metadata about entire ORC file. +#[derive(Debug)] +pub struct OrcMetadata { + /// If ORC file has compression enabled or not + pub compression_type: Option, + /// Information used for decoding each stripe + pub stripes: Vec, + /// Converted Arrow schema for entire file + pub schema: SchemaRef, + /// Total number of rows in the file + pub number_of_rows: u64, +} + +/// Contains information used to locate stripes and their sections +/// in the file. +#[derive(Debug, Copy, Clone)] +pub struct StripeInformation { + pub start_offset: u64, + pub index_length: u64, + pub data_length: u64, + pub footer_length: u64, + pub number_of_rows: u64, +} + +impl From for StripeInformation { + fn from(value: proto::StripeInformation) -> Self { + Self { + start_offset: value.offset(), + index_length: value.index_length(), + data_length: value.data_length(), + footer_length: value.footer_length(), + number_of_rows: value.number_of_rows(), + } + } +} + +#[cfg(test)] +mod tests { + use std::fs::File; + + use super::*; + + #[test] + fn test_parse_metadata() -> Result<()> { + let file_name = "demo-12-zlib.orc"; + let mut file = File::open(format!("tests/data/{file_name}"))?; + let _ = parse_metadata(&mut file)?; + + let file_name = "alltypes.none.orc"; + let mut file = File::open(format!("tests/data/{file_name}"))?; + let _ = parse_metadata(&mut file)?; + + Ok(()) + } +} diff --git a/arrow-orc/src/lib.rs b/arrow-orc/src/lib.rs new file mode 100644 index 000000000000..ecf80adf4d80 --- /dev/null +++ b/arrow-orc/src/lib.rs @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This crate contains the official Native Rust implementation of +//! [Apache ORC](https://orc.apache.org/), part of the +//! [Apache Arrow](https://arrow.apache.org/) project. +//! +//! # Getting Started +//! See [sync_reader] for synchronously reading ORC files to Arrow +//! [`RecordBatch`]es. +//! +//! [`RecordBatch`]: arrow_array::RecordBatch + +#[macro_use] +pub mod errors; +pub mod sync_reader; + +mod array_reader; +mod decompress; +mod file_metadata; +mod proto; +mod reader; +mod schema; diff --git a/arrow-orc/src/proto.rs b/arrow-orc/src/proto.rs new file mode 100644 index 000000000000..07545a68f2fa --- /dev/null +++ b/arrow-orc/src/proto.rs @@ -0,0 +1,812 @@ +// This file was automatically generated through the regen.sh script, and should not be edited. + +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct IntegerStatistics { + #[prost(sint64, optional, tag = "1")] + pub minimum: ::core::option::Option, + #[prost(sint64, optional, tag = "2")] + pub maximum: ::core::option::Option, + #[prost(sint64, optional, tag = "3")] + pub sum: ::core::option::Option, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct DoubleStatistics { + #[prost(double, optional, tag = "1")] + pub minimum: ::core::option::Option, + #[prost(double, optional, tag = "2")] + pub maximum: ::core::option::Option, + #[prost(double, optional, tag = "3")] + pub sum: ::core::option::Option, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct StringStatistics { + #[prost(string, optional, tag = "1")] + pub minimum: ::core::option::Option<::prost::alloc::string::String>, + #[prost(string, optional, tag = "2")] + pub maximum: ::core::option::Option<::prost::alloc::string::String>, + /// sum will store the total length of all strings in a stripe + #[prost(sint64, optional, tag = "3")] + pub sum: ::core::option::Option, + /// If the minimum or maximum value was longer than 1024 bytes, store a lower or upper + /// bound instead of the minimum or maximum values above. + #[prost(string, optional, tag = "4")] + pub lower_bound: ::core::option::Option<::prost::alloc::string::String>, + #[prost(string, optional, tag = "5")] + pub upper_bound: ::core::option::Option<::prost::alloc::string::String>, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct BucketStatistics { + #[prost(uint64, repeated, tag = "1")] + pub count: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct DecimalStatistics { + #[prost(string, optional, tag = "1")] + pub minimum: ::core::option::Option<::prost::alloc::string::String>, + #[prost(string, optional, tag = "2")] + pub maximum: ::core::option::Option<::prost::alloc::string::String>, + #[prost(string, optional, tag = "3")] + pub sum: ::core::option::Option<::prost::alloc::string::String>, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct DateStatistics { + /// min,max values saved as days since epoch + #[prost(sint32, optional, tag = "1")] + pub minimum: ::core::option::Option, + #[prost(sint32, optional, tag = "2")] + pub maximum: ::core::option::Option, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct TimestampStatistics { + /// min,max values saved as milliseconds since epoch + #[prost(sint64, optional, tag = "1")] + pub minimum: ::core::option::Option, + #[prost(sint64, optional, tag = "2")] + pub maximum: ::core::option::Option, + #[prost(sint64, optional, tag = "3")] + pub minimum_utc: ::core::option::Option, + #[prost(sint64, optional, tag = "4")] + pub maximum_utc: ::core::option::Option, + /// store the lower 6 TS digits for min/max to achieve nanosecond precision + #[prost(int32, optional, tag = "5")] + pub minimum_nanos: ::core::option::Option, + #[prost(int32, optional, tag = "6")] + pub maximum_nanos: ::core::option::Option, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct BinaryStatistics { + /// sum will store the total binary blob length in a stripe + #[prost(sint64, optional, tag = "1")] + pub sum: ::core::option::Option, +} +/// Statistics for list and map +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct CollectionStatistics { + #[prost(uint64, optional, tag = "1")] + pub min_children: ::core::option::Option, + #[prost(uint64, optional, tag = "2")] + pub max_children: ::core::option::Option, + #[prost(uint64, optional, tag = "3")] + pub total_children: ::core::option::Option, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ColumnStatistics { + #[prost(uint64, optional, tag = "1")] + pub number_of_values: ::core::option::Option, + #[prost(message, optional, tag = "2")] + pub int_statistics: ::core::option::Option, + #[prost(message, optional, tag = "3")] + pub double_statistics: ::core::option::Option, + #[prost(message, optional, tag = "4")] + pub string_statistics: ::core::option::Option, + #[prost(message, optional, tag = "5")] + pub bucket_statistics: ::core::option::Option, + #[prost(message, optional, tag = "6")] + pub decimal_statistics: ::core::option::Option, + #[prost(message, optional, tag = "7")] + pub date_statistics: ::core::option::Option, + #[prost(message, optional, tag = "8")] + pub binary_statistics: ::core::option::Option, + #[prost(message, optional, tag = "9")] + pub timestamp_statistics: ::core::option::Option, + #[prost(bool, optional, tag = "10")] + pub has_null: ::core::option::Option, + #[prost(uint64, optional, tag = "11")] + pub bytes_on_disk: ::core::option::Option, + #[prost(message, optional, tag = "12")] + pub collection_statistics: ::core::option::Option, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct RowIndexEntry { + #[prost(uint64, repeated, tag = "1")] + pub positions: ::prost::alloc::vec::Vec, + #[prost(message, optional, tag = "2")] + pub statistics: ::core::option::Option, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct RowIndex { + #[prost(message, repeated, tag = "1")] + pub entry: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct BloomFilter { + #[prost(uint32, optional, tag = "1")] + pub num_hash_functions: ::core::option::Option, + #[prost(fixed64, repeated, packed = "false", tag = "2")] + pub bitset: ::prost::alloc::vec::Vec, + #[prost(bytes = "vec", optional, tag = "3")] + pub utf8bitset: ::core::option::Option<::prost::alloc::vec::Vec>, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct BloomFilterIndex { + #[prost(message, repeated, tag = "1")] + pub bloom_filter: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Stream { + #[prost(enumeration = "stream::Kind", optional, tag = "1")] + pub kind: ::core::option::Option, + #[prost(uint32, optional, tag = "2")] + pub column: ::core::option::Option, + #[prost(uint64, optional, tag = "3")] + pub length: ::core::option::Option, +} +/// Nested message and enum types in `Stream`. +pub mod stream { + /// if you add new index stream kinds, you need to make sure to update + /// StreamName to ensure it is added to the stripe in the right area + #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] + #[repr(i32)] + pub enum Kind { + Present = 0, + Data = 1, + Length = 2, + DictionaryData = 3, + DictionaryCount = 4, + Secondary = 5, + RowIndex = 6, + BloomFilter = 7, + BloomFilterUtf8 = 8, + /// Virtual stream kinds to allocate space for encrypted index and data. + EncryptedIndex = 9, + EncryptedData = 10, + /// stripe statistics streams + StripeStatistics = 100, + /// A virtual stream kind that is used for setting the encryption IV. + FileStatistics = 101, + } + impl Kind { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Kind::Present => "PRESENT", + Kind::Data => "DATA", + Kind::Length => "LENGTH", + Kind::DictionaryData => "DICTIONARY_DATA", + Kind::DictionaryCount => "DICTIONARY_COUNT", + Kind::Secondary => "SECONDARY", + Kind::RowIndex => "ROW_INDEX", + Kind::BloomFilter => "BLOOM_FILTER", + Kind::BloomFilterUtf8 => "BLOOM_FILTER_UTF8", + Kind::EncryptedIndex => "ENCRYPTED_INDEX", + Kind::EncryptedData => "ENCRYPTED_DATA", + Kind::StripeStatistics => "STRIPE_STATISTICS", + Kind::FileStatistics => "FILE_STATISTICS", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "PRESENT" => Some(Self::Present), + "DATA" => Some(Self::Data), + "LENGTH" => Some(Self::Length), + "DICTIONARY_DATA" => Some(Self::DictionaryData), + "DICTIONARY_COUNT" => Some(Self::DictionaryCount), + "SECONDARY" => Some(Self::Secondary), + "ROW_INDEX" => Some(Self::RowIndex), + "BLOOM_FILTER" => Some(Self::BloomFilter), + "BLOOM_FILTER_UTF8" => Some(Self::BloomFilterUtf8), + "ENCRYPTED_INDEX" => Some(Self::EncryptedIndex), + "ENCRYPTED_DATA" => Some(Self::EncryptedData), + "STRIPE_STATISTICS" => Some(Self::StripeStatistics), + "FILE_STATISTICS" => Some(Self::FileStatistics), + _ => None, + } + } + } +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ColumnEncoding { + #[prost(enumeration = "column_encoding::Kind", optional, tag = "1")] + pub kind: ::core::option::Option, + #[prost(uint32, optional, tag = "2")] + pub dictionary_size: ::core::option::Option, + /// The encoding of the bloom filters for this column: + /// 0 or missing = none or original + /// 1 = ORC-135 (utc for timestamps) + #[prost(uint32, optional, tag = "3")] + pub bloom_encoding: ::core::option::Option, +} +/// Nested message and enum types in `ColumnEncoding`. +pub mod column_encoding { + #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] + #[repr(i32)] + pub enum Kind { + Direct = 0, + Dictionary = 1, + DirectV2 = 2, + DictionaryV2 = 3, + } + impl Kind { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Kind::Direct => "DIRECT", + Kind::Dictionary => "DICTIONARY", + Kind::DirectV2 => "DIRECT_V2", + Kind::DictionaryV2 => "DICTIONARY_V2", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "DIRECT" => Some(Self::Direct), + "DICTIONARY" => Some(Self::Dictionary), + "DIRECT_V2" => Some(Self::DirectV2), + "DICTIONARY_V2" => Some(Self::DictionaryV2), + _ => None, + } + } + } +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct StripeEncryptionVariant { + #[prost(message, repeated, tag = "1")] + pub streams: ::prost::alloc::vec::Vec, + #[prost(message, repeated, tag = "2")] + pub encoding: ::prost::alloc::vec::Vec, +} +// each stripe looks like: +// index streams +// unencrypted +// variant 1..N +// data streams +// unencrypted +// variant 1..N +// footer + +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct StripeFooter { + #[prost(message, repeated, tag = "1")] + pub streams: ::prost::alloc::vec::Vec, + #[prost(message, repeated, tag = "2")] + pub columns: ::prost::alloc::vec::Vec, + #[prost(string, optional, tag = "3")] + pub writer_timezone: ::core::option::Option<::prost::alloc::string::String>, + /// one for each column encryption variant + #[prost(message, repeated, tag = "4")] + pub encryption: ::prost::alloc::vec::Vec, +} +// the file tail looks like: +// encrypted stripe statistics: ColumnarStripeStatistics (order by variant) +// stripe statistics: Metadata +// footer: Footer +// postscript: PostScript +// psLen: byte + +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct StringPair { + #[prost(string, optional, tag = "1")] + pub key: ::core::option::Option<::prost::alloc::string::String>, + #[prost(string, optional, tag = "2")] + pub value: ::core::option::Option<::prost::alloc::string::String>, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Type { + #[prost(enumeration = "r#type::Kind", optional, tag = "1")] + pub kind: ::core::option::Option, + #[prost(uint32, repeated, tag = "2")] + pub subtypes: ::prost::alloc::vec::Vec, + #[prost(string, repeated, tag = "3")] + pub field_names: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + #[prost(uint32, optional, tag = "4")] + pub maximum_length: ::core::option::Option, + #[prost(uint32, optional, tag = "5")] + pub precision: ::core::option::Option, + #[prost(uint32, optional, tag = "6")] + pub scale: ::core::option::Option, + #[prost(message, repeated, tag = "7")] + pub attributes: ::prost::alloc::vec::Vec, +} +/// Nested message and enum types in `Type`. +pub mod r#type { + #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] + #[repr(i32)] + pub enum Kind { + Boolean = 0, + Byte = 1, + Short = 2, + Int = 3, + Long = 4, + Float = 5, + Double = 6, + String = 7, + Binary = 8, + Timestamp = 9, + List = 10, + Map = 11, + Struct = 12, + Union = 13, + Decimal = 14, + Date = 15, + Varchar = 16, + Char = 17, + TimestampInstant = 18, + } + impl Kind { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Kind::Boolean => "BOOLEAN", + Kind::Byte => "BYTE", + Kind::Short => "SHORT", + Kind::Int => "INT", + Kind::Long => "LONG", + Kind::Float => "FLOAT", + Kind::Double => "DOUBLE", + Kind::String => "STRING", + Kind::Binary => "BINARY", + Kind::Timestamp => "TIMESTAMP", + Kind::List => "LIST", + Kind::Map => "MAP", + Kind::Struct => "STRUCT", + Kind::Union => "UNION", + Kind::Decimal => "DECIMAL", + Kind::Date => "DATE", + Kind::Varchar => "VARCHAR", + Kind::Char => "CHAR", + Kind::TimestampInstant => "TIMESTAMP_INSTANT", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "BOOLEAN" => Some(Self::Boolean), + "BYTE" => Some(Self::Byte), + "SHORT" => Some(Self::Short), + "INT" => Some(Self::Int), + "LONG" => Some(Self::Long), + "FLOAT" => Some(Self::Float), + "DOUBLE" => Some(Self::Double), + "STRING" => Some(Self::String), + "BINARY" => Some(Self::Binary), + "TIMESTAMP" => Some(Self::Timestamp), + "LIST" => Some(Self::List), + "MAP" => Some(Self::Map), + "STRUCT" => Some(Self::Struct), + "UNION" => Some(Self::Union), + "DECIMAL" => Some(Self::Decimal), + "DATE" => Some(Self::Date), + "VARCHAR" => Some(Self::Varchar), + "CHAR" => Some(Self::Char), + "TIMESTAMP_INSTANT" => Some(Self::TimestampInstant), + _ => None, + } + } + } +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct StripeInformation { + /// the global file offset of the start of the stripe + #[prost(uint64, optional, tag = "1")] + pub offset: ::core::option::Option, + /// the number of bytes of index + #[prost(uint64, optional, tag = "2")] + pub index_length: ::core::option::Option, + /// the number of bytes of data + #[prost(uint64, optional, tag = "3")] + pub data_length: ::core::option::Option, + /// the number of bytes in the stripe footer + #[prost(uint64, optional, tag = "4")] + pub footer_length: ::core::option::Option, + /// the number of rows in this stripe + #[prost(uint64, optional, tag = "5")] + pub number_of_rows: ::core::option::Option, + /// If this is present, the reader should use this value for the encryption + /// stripe id for setting the encryption IV. Otherwise, the reader should + /// use one larger than the previous stripe's encryptStripeId. + /// For unmerged ORC files, the first stripe will use 1 and the rest of the + /// stripes won't have it set. For merged files, the stripe information + /// will be copied from their original files and thus the first stripe of + /// each of the input files will reset it to 1. + /// Note that 1 was choosen, because protobuf v3 doesn't serialize + /// primitive types that are the default (eg. 0). + #[prost(uint64, optional, tag = "6")] + pub encrypt_stripe_id: ::core::option::Option, + /// For each encryption variant, the new encrypted local key to use + /// until we find a replacement. + #[prost(bytes = "vec", repeated, tag = "7")] + pub encrypted_local_keys: ::prost::alloc::vec::Vec<::prost::alloc::vec::Vec>, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct UserMetadataItem { + #[prost(string, optional, tag = "1")] + pub name: ::core::option::Option<::prost::alloc::string::String>, + #[prost(bytes = "vec", optional, tag = "2")] + pub value: ::core::option::Option<::prost::alloc::vec::Vec>, +} +/// StripeStatistics (1 per a stripe), which each contain the +/// ColumnStatistics for each column. +/// This message type is only used in ORC v0 and v1. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct StripeStatistics { + #[prost(message, repeated, tag = "1")] + pub col_stats: ::prost::alloc::vec::Vec, +} +/// This message type is only used in ORC v0 and v1. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Metadata { + #[prost(message, repeated, tag = "1")] + pub stripe_stats: ::prost::alloc::vec::Vec, +} +/// In ORC v2 (and for encrypted columns in v1), each column has +/// their column statistics written separately. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ColumnarStripeStatistics { + /// one value for each stripe in the file + #[prost(message, repeated, tag = "1")] + pub col_stats: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct FileStatistics { + #[prost(message, repeated, tag = "1")] + pub column: ::prost::alloc::vec::Vec, +} +/// How was the data masked? This isn't necessary for reading the file, but +/// is documentation about how the file was written. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct DataMask { + /// the kind of masking, which may include third party masks + #[prost(string, optional, tag = "1")] + pub name: ::core::option::Option<::prost::alloc::string::String>, + /// parameters for the mask + #[prost(string, repeated, tag = "2")] + pub mask_parameters: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + /// the unencrypted column roots this mask was applied to + #[prost(uint32, repeated, tag = "3")] + pub columns: ::prost::alloc::vec::Vec, +} +/// Information about the encryption keys. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct EncryptionKey { + #[prost(string, optional, tag = "1")] + pub key_name: ::core::option::Option<::prost::alloc::string::String>, + #[prost(uint32, optional, tag = "2")] + pub key_version: ::core::option::Option, + #[prost(enumeration = "EncryptionAlgorithm", optional, tag = "3")] + pub algorithm: ::core::option::Option, +} +/// The description of an encryption variant. +/// Each variant is a single subtype that is encrypted with a single key. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct EncryptionVariant { + /// the column id of the root + #[prost(uint32, optional, tag = "1")] + pub root: ::core::option::Option, + /// The master key that was used to encrypt the local key, referenced as + /// an index into the Encryption.key list. + #[prost(uint32, optional, tag = "2")] + pub key: ::core::option::Option, + /// the encrypted key for the file footer + #[prost(bytes = "vec", optional, tag = "3")] + pub encrypted_key: ::core::option::Option<::prost::alloc::vec::Vec>, + /// the stripe statistics for this variant + #[prost(message, repeated, tag = "4")] + pub stripe_statistics: ::prost::alloc::vec::Vec, + /// encrypted file statistics as a FileStatistics + #[prost(bytes = "vec", optional, tag = "5")] + pub file_statistics: ::core::option::Option<::prost::alloc::vec::Vec>, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Encryption { + /// all of the masks used in this file + #[prost(message, repeated, tag = "1")] + pub mask: ::prost::alloc::vec::Vec, + /// all of the keys used in this file + #[prost(message, repeated, tag = "2")] + pub key: ::prost::alloc::vec::Vec, + /// The encrypted variants. + /// Readers should prefer the first variant that the user has access to + /// the corresponding key. If they don't have access to any of the keys, + /// they should get the unencrypted masked data. + #[prost(message, repeated, tag = "3")] + pub variants: ::prost::alloc::vec::Vec, + /// How are the local keys encrypted? + #[prost(enumeration = "KeyProviderKind", optional, tag = "4")] + pub key_provider: ::core::option::Option, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Footer { + #[prost(uint64, optional, tag = "1")] + pub header_length: ::core::option::Option, + #[prost(uint64, optional, tag = "2")] + pub content_length: ::core::option::Option, + #[prost(message, repeated, tag = "3")] + pub stripes: ::prost::alloc::vec::Vec, + #[prost(message, repeated, tag = "4")] + pub types: ::prost::alloc::vec::Vec, + #[prost(message, repeated, tag = "5")] + pub metadata: ::prost::alloc::vec::Vec, + #[prost(uint64, optional, tag = "6")] + pub number_of_rows: ::core::option::Option, + #[prost(message, repeated, tag = "7")] + pub statistics: ::prost::alloc::vec::Vec, + #[prost(uint32, optional, tag = "8")] + pub row_index_stride: ::core::option::Option, + /// Each implementation that writes ORC files should register for a code + /// 0 = ORC Java + /// 1 = ORC C++ + /// 2 = Presto + /// 3 = Scritchley Go from + /// 4 = Trino + #[prost(uint32, optional, tag = "9")] + pub writer: ::core::option::Option, + /// information about the encryption in this file + #[prost(message, optional, tag = "10")] + pub encryption: ::core::option::Option, + #[prost(enumeration = "CalendarKind", optional, tag = "11")] + pub calendar: ::core::option::Option, + /// informative description about the version of the software that wrote + /// the file. It is assumed to be within a given writer, so for example + /// ORC 1.7.2 = "1.7.2". It may include suffixes, such as "-SNAPSHOT". + #[prost(string, optional, tag = "12")] + pub software_version: ::core::option::Option<::prost::alloc::string::String>, +} +/// Serialized length must be less that 255 bytes +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct PostScript { + #[prost(uint64, optional, tag = "1")] + pub footer_length: ::core::option::Option, + #[prost(enumeration = "CompressionKind", optional, tag = "2")] + pub compression: ::core::option::Option, + #[prost(uint64, optional, tag = "3")] + pub compression_block_size: ::core::option::Option, + /// the version of the file format + /// \[0, 11\] = Hive 0.11 + /// \[0, 12\] = Hive 0.12 + #[prost(uint32, repeated, tag = "4")] + pub version: ::prost::alloc::vec::Vec, + #[prost(uint64, optional, tag = "5")] + pub metadata_length: ::core::option::Option, + /// The version of the writer that wrote the file. This number is + /// updated when we make fixes or large changes to the writer so that + /// readers can detect whether a given bug is present in the data. + /// + /// Only the Java ORC writer may use values under 6 (or missing) so that + /// readers that predate ORC-202 treat the new writers correctly. Each + /// writer should assign their own sequence of versions starting from 6. + /// + /// Version of the ORC Java writer: + /// 0 = original + /// 1 = HIVE-8732 fixed (fixed stripe/file maximum statistics & + /// string statistics use utf8 for min/max) + /// 2 = HIVE-4243 fixed (use real column names from Hive tables) + /// 3 = HIVE-12055 added (vectorized writer implementation) + /// 4 = HIVE-13083 fixed (decimals write present stream correctly) + /// 5 = ORC-101 fixed (bloom filters use utf8 consistently) + /// 6 = ORC-135 fixed (timestamp statistics use utc) + /// 7 = ORC-517 fixed (decimal64 min/max incorrect) + /// 8 = ORC-203 added (trim very long string statistics) + /// 9 = ORC-14 added (column encryption) + /// + /// Version of the ORC C++ writer: + /// 6 = original + /// + /// Version of the Presto writer: + /// 6 = original + /// + /// Version of the Scritchley Go writer: + /// 6 = original + /// + /// Version of the Trino writer: + /// 6 = original + /// + #[prost(uint32, optional, tag = "6")] + pub writer_version: ::core::option::Option, + /// the number of bytes in the encrypted stripe statistics + #[prost(uint64, optional, tag = "7")] + pub stripe_statistics_length: ::core::option::Option, + /// Leave this last in the record + #[prost(string, optional, tag = "8000")] + pub magic: ::core::option::Option<::prost::alloc::string::String>, +} +/// The contents of the file tail that must be serialized. +/// This gets serialized as part of OrcSplit, also used by footer cache. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct FileTail { + #[prost(message, optional, tag = "1")] + pub postscript: ::core::option::Option, + #[prost(message, optional, tag = "2")] + pub footer: ::core::option::Option