From 47bb450f3aa14f0cbcdcf0b6db6c86c76cdf5e4d Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Fri, 3 Nov 2023 22:25:17 +1100 Subject: [PATCH] Crate for generating proto.rs from orc_proto.proto (#3) --- Cargo.lock | 180 ++++++++++++- Cargo.toml | 3 + format/orc_proto.proto | 451 +++++++++++++++++++++++++++++++ gen/Cargo.toml | 28 ++ gen/src/main.rs | 46 ++++ regen.sh | 21 ++ rustfmt.toml | 3 +- src/arrow_reader.rs | 3 +- src/arrow_reader/column.rs | 2 +- src/proto.rs | 538 ++++++++++++++++++++++++++++++++++++- src/reader/schema.rs | 2 + tests/basic/main.rs | 50 ++-- typos.toml | 6 +- 13 files changed, 1291 insertions(+), 42 deletions(-) create mode 100644 format/orc_proto.proto create mode 100644 gen/Cargo.toml create mode 100644 gen/src/main.rs create mode 100755 regen.sh diff --git a/Cargo.lock b/Cargo.lock index 8a2216af..b2bea7c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -301,6 +301,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" + [[package]] name = "bumpalo" version = "3.14.0" @@ -429,7 +435,7 @@ dependencies = [ "futures-util", "lazy_static", "paste", - "prost", + "prost 0.11.9", "snafu", "tokio", "zigzag", @@ -454,19 +460,41 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "errno" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" +dependencies = [ + "libc", + "windows-sys", +] + [[package]] name = "fallible-streaming-iterator" version = "0.1.9" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + [[package]] name = "flatbuffers" version = "23.5.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" dependencies = [ - "bitflags", + "bitflags 1.3.2", "rustc_version", ] @@ -557,6 +585,13 @@ dependencies = [ "slab", ] +[[package]] +name = "gen" +version = "0.1.0" +dependencies = [ + "prost-build", +] + [[package]] name = "getrandom" version = "0.2.10" @@ -597,6 +632,15 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "home" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" +dependencies = [ + "windows-sys", +] + [[package]] name = "iana-time-zone" version = "0.1.58" @@ -745,6 +789,12 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +[[package]] +name = "linux-raw-sys" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" + [[package]] name = "log" version = "0.4.20" @@ -766,6 +816,12 @@ dependencies = [ "adler", ] +[[package]] +name = "multimap" +version = "0.8.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + [[package]] name = "num" version = "0.4.1" @@ -864,6 +920,16 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" +[[package]] +name = "petgraph" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" +dependencies = [ + "fixedbitset", + "indexmap", +] + [[package]] name = "pin-project-lite" version = "0.2.13" @@ -898,7 +964,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.11.9", +] + +[[package]] +name = "prost" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4fdd22f3b9c31b53c060df4a0613a1c7f062d4115a2b984dd15b1858f7e340d" +dependencies = [ + "bytes", + "prost-derive 0.12.1", +] + +[[package]] +name = "prost-build" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bdf592881d821b83d471f8af290226c8d51402259e9bb5be7f9f8bdebbb11ac" +dependencies = [ + "bytes", + "heck", + "itertools", + "log", + "multimap", + "once_cell", + "petgraph", + "prost 0.12.1", + "prost-types", + "regex", + "tempfile", + "which", ] [[package]] @@ -914,6 +1010,28 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "prost-derive" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "265baba7fabd416cf5078179f7d2cbeca4ce7a9041111900675ea7c4cb8a4c32" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn 2.0.38", +] + +[[package]] +name = "prost-types" +version = "0.12.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e081b29f63d83a4bc75cfc9f3fe424f9156cf92d8a4f0c9407cce9a1b67327cf" +dependencies = [ + "prost 0.12.1", +] + [[package]] name = "quote" version = "1.0.33" @@ -923,6 +1041,15 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" version = "1.10.2" @@ -973,6 +1100,19 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "0.38.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "rustversion" version = "1.0.14" @@ -1100,6 +1240,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys", +] + [[package]] name = "tiny-keccak" version = "2.0.2" @@ -1210,6 +1363,18 @@ version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d046c5d029ba91a1ed14da14dca44b68bf2f124cfbaf741c54151fdb3e0750b" +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + [[package]] name = "windows-core" version = "0.51.1" @@ -1219,6 +1384,15 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-sys" +version = "0.48.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-targets" version = "0.48.5" diff --git a/Cargo.toml b/Cargo.toml index c73c15bb..31add2c7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,6 @@ +[workspace] +members = ["gen"] + [package] name = "datafusion-orc" version = "0.2.43" diff --git a/format/orc_proto.proto b/format/orc_proto.proto new file mode 100644 index 00000000..ff05657a --- /dev/null +++ b/format/orc_proto.proto @@ -0,0 +1,451 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +syntax = "proto2"; + +package orc.proto; + +option java_package = "org.apache.orc"; + +message IntegerStatistics { + optional sint64 minimum = 1; + optional sint64 maximum = 2; + optional sint64 sum = 3; +} + +message DoubleStatistics { + optional double minimum = 1; + optional double maximum = 2; + optional double sum = 3; +} + +message StringStatistics { + optional string minimum = 1; + optional string maximum = 2; + // sum will store the total length of all strings in a stripe + optional sint64 sum = 3; + // If the minimum or maximum value was longer than 1024 bytes, store a lower or upper + // bound instead of the minimum or maximum values above. + optional string lowerBound = 4; + optional string upperBound = 5; +} + +message BucketStatistics { + repeated uint64 count = 1 [packed=true]; +} + +message DecimalStatistics { + optional string minimum = 1; + optional string maximum = 2; + optional string sum = 3; +} + +message DateStatistics { + // min,max values saved as days since epoch + optional sint32 minimum = 1; + optional sint32 maximum = 2; +} + +message TimestampStatistics { + // min,max values saved as milliseconds since epoch + optional sint64 minimum = 1; + optional sint64 maximum = 2; + optional sint64 minimumUtc = 3; + optional sint64 maximumUtc = 4; + // store the lower 6 TS digits for min/max to achieve nanosecond precision + optional int32 minimumNanos = 5; + optional int32 maximumNanos = 6; +} + +message BinaryStatistics { + // sum will store the total binary blob length in a stripe + optional sint64 sum = 1; +} + +// Statistics for list and map +message CollectionStatistics { + optional uint64 minChildren = 1; + optional uint64 maxChildren = 2; + optional uint64 totalChildren = 3; +} + +message ColumnStatistics { + optional uint64 numberOfValues = 1; + optional IntegerStatistics intStatistics = 2; + optional DoubleStatistics doubleStatistics = 3; + optional StringStatistics stringStatistics = 4; + optional BucketStatistics 
bucketStatistics = 5; + optional DecimalStatistics decimalStatistics = 6; + optional DateStatistics dateStatistics = 7; + optional BinaryStatistics binaryStatistics = 8; + optional TimestampStatistics timestampStatistics = 9; + optional bool hasNull = 10; + optional uint64 bytesOnDisk = 11; + optional CollectionStatistics collectionStatistics = 12; +} + +message RowIndexEntry { + repeated uint64 positions = 1 [packed=true]; + optional ColumnStatistics statistics = 2; +} + +message RowIndex { + repeated RowIndexEntry entry = 1; +} + +message BloomFilter { + optional uint32 numHashFunctions = 1; + repeated fixed64 bitset = 2; + optional bytes utf8bitset = 3; +} + +message BloomFilterIndex { + repeated BloomFilter bloomFilter = 1; +} + +message Stream { + // if you add new index stream kinds, you need to make sure to update + // StreamName to ensure it is added to the stripe in the right area + enum Kind { + PRESENT = 0; + DATA = 1; + LENGTH = 2; + DICTIONARY_DATA = 3; + DICTIONARY_COUNT = 4; + SECONDARY = 5; + ROW_INDEX = 6; + BLOOM_FILTER = 7; + BLOOM_FILTER_UTF8 = 8; + // Virtual stream kinds to allocate space for encrypted index and data. + ENCRYPTED_INDEX = 9; + ENCRYPTED_DATA = 10; + + // stripe statistics streams + STRIPE_STATISTICS = 100; + // A virtual stream kind that is used for setting the encryption IV. 
+ FILE_STATISTICS = 101; + } + optional Kind kind = 1; + optional uint32 column = 2; + optional uint64 length = 3; +} + +message ColumnEncoding { + enum Kind { + DIRECT = 0; + DICTIONARY = 1; + DIRECT_V2 = 2; + DICTIONARY_V2 = 3; + } + optional Kind kind = 1; + optional uint32 dictionarySize = 2; + + // The encoding of the bloom filters for this column: + // 0 or missing = none or original + // 1 = ORC-135 (utc for timestamps) + optional uint32 bloomEncoding = 3; +} + +message StripeEncryptionVariant { + repeated Stream streams = 1; + repeated ColumnEncoding encoding = 2; +} + +// each stripe looks like: +// index streams +// unencrypted +// variant 1..N +// data streams +// unencrypted +// variant 1..N +// footer + +message StripeFooter { + repeated Stream streams = 1; + repeated ColumnEncoding columns = 2; + optional string writerTimezone = 3; + // one for each column encryption variant + repeated StripeEncryptionVariant encryption = 4; +} + +// the file tail looks like: +// encrypted stripe statistics: ColumnarStripeStatistics (order by variant) +// stripe statistics: Metadata +// footer: Footer +// postscript: PostScript +// psLen: byte + +message StringPair { + optional string key = 1; + optional string value = 2; +} + +message Type { + enum Kind { + BOOLEAN = 0; + BYTE = 1; + SHORT = 2; + INT = 3; + LONG = 4; + FLOAT = 5; + DOUBLE = 6; + STRING = 7; + BINARY = 8; + TIMESTAMP = 9; + LIST = 10; + MAP = 11; + STRUCT = 12; + UNION = 13; + DECIMAL = 14; + DATE = 15; + VARCHAR = 16; + CHAR = 17; + TIMESTAMP_INSTANT = 18; + } + optional Kind kind = 1; + repeated uint32 subtypes = 2 [packed=true]; + repeated string fieldNames = 3; + optional uint32 maximumLength = 4; + optional uint32 precision = 5; + optional uint32 scale = 6; + repeated StringPair attributes = 7; +} + +message StripeInformation { + // the global file offset of the start of the stripe + optional uint64 offset = 1; + // the number of bytes of index + optional uint64 indexLength = 2; + // the number 
of bytes of data + optional uint64 dataLength = 3; + // the number of bytes in the stripe footer + optional uint64 footerLength = 4; + // the number of rows in this stripe + optional uint64 numberOfRows = 5; + // If this is present, the reader should use this value for the encryption + // stripe id for setting the encryption IV. Otherwise, the reader should + // use one larger than the previous stripe's encryptStripeId. + // For unmerged ORC files, the first stripe will use 1 and the rest of the + // stripes won't have it set. For merged files, the stripe information + // will be copied from their original files and thus the first stripe of + // each of the input files will reset it to 1. + // Note that 1 was choosen, because protobuf v3 doesn't serialize + // primitive types that are the default (eg. 0). + optional uint64 encryptStripeId = 6; + // For each encryption variant, the new encrypted local key to use + // until we find a replacement. + repeated bytes encryptedLocalKeys = 7; +} + +message UserMetadataItem { + optional string name = 1; + optional bytes value = 2; +} + +// StripeStatistics (1 per a stripe), which each contain the +// ColumnStatistics for each column. +// This message type is only used in ORC v0 and v1. +message StripeStatistics { + repeated ColumnStatistics colStats = 1; +} + +// This message type is only used in ORC v0 and v1. +message Metadata { + repeated StripeStatistics stripeStats = 1; +} + +// In ORC v2 (and for encrypted columns in v1), each column has +// their column statistics written separately. +message ColumnarStripeStatistics { + // one value for each stripe in the file + repeated ColumnStatistics colStats = 1; +} + +enum EncryptionAlgorithm { + UNKNOWN_ENCRYPTION = 0; // used for detecting future algorithms + AES_CTR_128 = 1; + AES_CTR_256 = 2; +} + +message FileStatistics { + repeated ColumnStatistics column = 1; +} + +// How was the data masked? 
This isn't necessary for reading the file, but +// is documentation about how the file was written. +message DataMask { + // the kind of masking, which may include third party masks + optional string name = 1; + // parameters for the mask + repeated string maskParameters = 2; + // the unencrypted column roots this mask was applied to + repeated uint32 columns = 3 [packed = true]; +} + +// Information about the encryption keys. +message EncryptionKey { + optional string keyName = 1; + optional uint32 keyVersion = 2; + optional EncryptionAlgorithm algorithm = 3; +} + +// The description of an encryption variant. +// Each variant is a single subtype that is encrypted with a single key. +message EncryptionVariant { + // the column id of the root + optional uint32 root = 1; + // The master key that was used to encrypt the local key, referenced as + // an index into the Encryption.key list. + optional uint32 key = 2; + // the encrypted key for the file footer + optional bytes encryptedKey = 3; + // the stripe statistics for this variant + repeated Stream stripeStatistics = 4; + // encrypted file statistics as a FileStatistics + optional bytes fileStatistics = 5; +} + +// Which KeyProvider encrypted the local keys. +enum KeyProviderKind { + UNKNOWN = 0; + HADOOP = 1; + AWS = 2; + GCP = 3; + AZURE = 4; +} + +message Encryption { + // all of the masks used in this file + repeated DataMask mask = 1; + // all of the keys used in this file + repeated EncryptionKey key = 2; + // The encrypted variants. + // Readers should prefer the first variant that the user has access to + // the corresponding key. If they don't have access to any of the keys, + // they should get the unencrypted masked data. + repeated EncryptionVariant variants = 3; + // How are the local keys encrypted? + optional KeyProviderKind keyProvider = 4; +} + +enum CalendarKind { + UNKNOWN_CALENDAR = 0; + // A hybrid Julian/Gregorian calendar with a cutover point in October 1582. 
+ JULIAN_GREGORIAN = 1; + // A calendar that extends the Gregorian calendar back forever. + PROLEPTIC_GREGORIAN = 2; +} + +message Footer { + optional uint64 headerLength = 1; + optional uint64 contentLength = 2; + repeated StripeInformation stripes = 3; + repeated Type types = 4; + repeated UserMetadataItem metadata = 5; + optional uint64 numberOfRows = 6; + repeated ColumnStatistics statistics = 7; + optional uint32 rowIndexStride = 8; + + // Each implementation that writes ORC files should register for a code + // 0 = ORC Java + // 1 = ORC C++ + // 2 = Presto + // 3 = Scritchley Go from https://github.com/scritchley/orc + // 4 = Trino + optional uint32 writer = 9; + + // information about the encryption in this file + optional Encryption encryption = 10; + optional CalendarKind calendar = 11; + + // informative description about the version of the software that wrote + // the file. It is assumed to be within a given writer, so for example + // ORC 1.7.2 = "1.7.2". It may include suffixes, such as "-SNAPSHOT". + optional string softwareVersion = 12; +} + +enum CompressionKind { + NONE = 0; + ZLIB = 1; + SNAPPY = 2; + LZO = 3; + LZ4 = 4; + ZSTD = 5; +} + +// Serialized length must be less that 255 bytes +message PostScript { + optional uint64 footerLength = 1; + optional CompressionKind compression = 2; + optional uint64 compressionBlockSize = 3; + // the version of the file format + // [0, 11] = Hive 0.11 + // [0, 12] = Hive 0.12 + repeated uint32 version = 4 [packed = true]; + optional uint64 metadataLength = 5; + + // The version of the writer that wrote the file. This number is + // updated when we make fixes or large changes to the writer so that + // readers can detect whether a given bug is present in the data. + // + // Only the Java ORC writer may use values under 6 (or missing) so that + // readers that predate ORC-202 treat the new writers correctly. Each + // writer should assign their own sequence of versions starting from 6. 
+ // + // Version of the ORC Java writer: + // 0 = original + // 1 = HIVE-8732 fixed (fixed stripe/file maximum statistics & + // string statistics use utf8 for min/max) + // 2 = HIVE-4243 fixed (use real column names from Hive tables) + // 3 = HIVE-12055 added (vectorized writer implementation) + // 4 = HIVE-13083 fixed (decimals write present stream correctly) + // 5 = ORC-101 fixed (bloom filters use utf8 consistently) + // 6 = ORC-135 fixed (timestamp statistics use utc) + // 7 = ORC-517 fixed (decimal64 min/max incorrect) + // 8 = ORC-203 added (trim very long string statistics) + // 9 = ORC-14 added (column encryption) + // + // Version of the ORC C++ writer: + // 6 = original + // + // Version of the Presto writer: + // 6 = original + // + // Version of the Scritchley Go writer: + // 6 = original + // + // Version of the Trino writer: + // 6 = original + // + optional uint32 writerVersion = 6; + + // the number of bytes in the encrypted stripe statistics + optional uint64 stripeStatisticsLength = 7; + + // Leave this last in the record + optional string magic = 8000; +} + +// The contents of the file tail that must be serialized. +// This gets serialized as part of OrcSplit, also used by footer cache. +message FileTail { + optional PostScript postscript = 1; + optional Footer footer = 2; + optional uint64 fileLength = 3; + optional uint64 postscriptLength = 4; +} diff --git a/gen/Cargo.toml b/gen/Cargo.toml new file mode 100644 index 00000000..b9f2767e --- /dev/null +++ b/gen/Cargo.toml @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "gen" +description = "Code generation for datafusion-orc" +version = "0.1.0" +edition = "2021" +rust-version = "1.70" +license = "Apache-2.0" +publish = false + +[dependencies] +prost-build = { version = "=0.12.1", default-features = false } diff --git a/gen/src/main.rs b/gen/src/main.rs new file mode 100644 index 00000000..c1cba08b --- /dev/null +++ b/gen/src/main.rs @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::fs::{remove_file, OpenOptions}; +use std::io::{Read, Write}; + +fn main() -> Result<(), Box> { + prost_build::Config::new() + .out_dir("src/") + .compile_well_known_types() + .extern_path(".google.protobuf", "::pbjson_types") + .compile_protos(&["format/orc_proto.proto"], &["format"])?; + + // read the generated file contents into a string + let mut file = OpenOptions::new().read(true).open("src/orc.proto.rs")?; + let mut buffer = String::new(); + file.read_to_string(&mut buffer)?; + // prepend a warning that the file was auto-generated + let mut file = OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open("src/proto.rs")?; + file.write_all("// This file was automatically generated through the regen.sh script, and should not be edited.\n\n".as_bytes())?; + file.write_all(buffer.as_bytes())?; + + // remove the prost output, since its contents now live in proto.rs (avoids a period in the file name) + remove_file("src/orc.proto.rs")?; + + // NOTE(review): the proto file is checked in, so it is expected to be found; any failure above is propagated by `?` + Ok(()) +} diff --git a/regen.sh b/regen.sh new file mode 100755 index 00000000..d83f9d58 --- /dev/null +++ b/regen.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd $SCRIPT_DIR && cargo run --manifest-path gen/Cargo.toml diff --git a/rustfmt.toml b/rustfmt.toml index 64d94def..3a26366d 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1,2 +1 @@ -group_imports = "StdExternalCrate" -imports_granularity = "Module" +edition = "2021" diff --git a/src/arrow_reader.rs b/src/arrow_reader.rs index a8903a25..6fe4cd4e 100644 --- a/src/arrow_reader.rs +++ b/src/arrow_reader.rs @@ -321,7 +321,7 @@ impl NaiveStripeDecoder { let mut decoders = Vec::with_capacity(stripe.columns.len()); let number_of_rows = stripe .columns - .get(0) + .first() .map(|c| c.number_of_rows()) .unwrap_or_default(); for col in &stripe.columns { @@ -346,6 +346,7 @@ impl NaiveStripeDecoder { crate::proto::r#type::Kind::Date => Decoder::Date(new_date_iter(col)?), crate::proto::r#type::Kind::Varchar => Decoder::String(StringDecoder::new(col)?), crate::proto::r#type::Kind::Char => Decoder::String(StringDecoder::new(col)?), + crate::proto::r#type::Kind::TimestampInstant => todo!(), }; decoders.push(decoder); } diff --git a/src/arrow_reader/column.rs b/src/arrow_reader/column.rs index 5f0dff86..3ac43871 100644 --- a/src/arrow_reader/column.rs +++ b/src/arrow_reader/column.rs @@ -169,7 +169,7 @@ impl Column { pub fn encoding(&self) -> ColumnEncoding { let column = self.column.column_id(); - self.footer.columns[column] + self.footer.columns[column].clone() } pub fn number_of_rows(&self) -> usize { diff --git a/src/proto.rs b/src/proto.rs index 4a0956aa..07545a68 100644 --- a/src/proto.rs +++ b/src/proto.rs @@ -1,5 +1,6 @@ -// Copied from https://github.com/DataEngineeringLabs/orc-format/blob/416490db0214fc51d53289253c0ee91f7fc9bc17/src/proto.rs -// TODO(weny): Considers using the official proto file? +// This file was automatically generated through the regen.sh script, and should not be edited. 
+ +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct IntegerStatistics { #[prost(sint64, optional, tag = "1")] @@ -9,6 +10,7 @@ pub struct IntegerStatistics { #[prost(sint64, optional, tag = "3")] pub sum: ::core::option::Option, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct DoubleStatistics { #[prost(double, optional, tag = "1")] @@ -18,6 +20,7 @@ pub struct DoubleStatistics { #[prost(double, optional, tag = "3")] pub sum: ::core::option::Option, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct StringStatistics { #[prost(string, optional, tag = "1")] @@ -27,12 +30,20 @@ pub struct StringStatistics { /// sum will store the total length of all strings in a stripe #[prost(sint64, optional, tag = "3")] pub sum: ::core::option::Option, + /// If the minimum or maximum value was longer than 1024 bytes, store a lower or upper + /// bound instead of the minimum or maximum values above. 
+ #[prost(string, optional, tag = "4")] + pub lower_bound: ::core::option::Option<::prost::alloc::string::String>, + #[prost(string, optional, tag = "5")] + pub upper_bound: ::core::option::Option<::prost::alloc::string::String>, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct BucketStatistics { #[prost(uint64, repeated, tag = "1")] pub count: ::prost::alloc::vec::Vec, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct DecimalStatistics { #[prost(string, optional, tag = "1")] @@ -42,6 +53,7 @@ pub struct DecimalStatistics { #[prost(string, optional, tag = "3")] pub sum: ::core::option::Option<::prost::alloc::string::String>, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct DateStatistics { /// min,max values saved as days since epoch @@ -50,6 +62,7 @@ pub struct DateStatistics { #[prost(sint32, optional, tag = "2")] pub maximum: ::core::option::Option, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct TimestampStatistics { /// min,max values saved as milliseconds since epoch @@ -57,13 +70,35 @@ pub struct TimestampStatistics { pub minimum: ::core::option::Option, #[prost(sint64, optional, tag = "2")] pub maximum: ::core::option::Option, + #[prost(sint64, optional, tag = "3")] + pub minimum_utc: ::core::option::Option, + #[prost(sint64, optional, tag = "4")] + pub maximum_utc: ::core::option::Option, + /// store the lower 6 TS digits for min/max to achieve nanosecond precision + #[prost(int32, optional, tag = "5")] + pub minimum_nanos: ::core::option::Option, + #[prost(int32, optional, tag = "6")] + pub maximum_nanos: ::core::option::Option, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct BinaryStatistics { /// sum will store the total binary blob length in a stripe #[prost(sint64, 
optional, tag = "1")] pub sum: ::core::option::Option, } +/// Statistics for list and map +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct CollectionStatistics { + #[prost(uint64, optional, tag = "1")] + pub min_children: ::core::option::Option, + #[prost(uint64, optional, tag = "2")] + pub max_children: ::core::option::Option, + #[prost(uint64, optional, tag = "3")] + pub total_children: ::core::option::Option, +} +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct ColumnStatistics { #[prost(uint64, optional, tag = "1")] @@ -86,7 +121,12 @@ pub struct ColumnStatistics { pub timestamp_statistics: ::core::option::Option, #[prost(bool, optional, tag = "10")] pub has_null: ::core::option::Option, + #[prost(uint64, optional, tag = "11")] + pub bytes_on_disk: ::core::option::Option, + #[prost(message, optional, tag = "12")] + pub collection_statistics: ::core::option::Option, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct RowIndexEntry { #[prost(uint64, repeated, tag = "1")] @@ -94,23 +134,29 @@ pub struct RowIndexEntry { #[prost(message, optional, tag = "2")] pub statistics: ::core::option::Option, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct RowIndex { #[prost(message, repeated, tag = "1")] pub entry: ::prost::alloc::vec::Vec, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct BloomFilter { #[prost(uint32, optional, tag = "1")] pub num_hash_functions: ::core::option::Option, #[prost(fixed64, repeated, packed = "false", tag = "2")] pub bitset: ::prost::alloc::vec::Vec, + #[prost(bytes = "vec", optional, tag = "3")] + pub utf8bitset: ::core::option::Option<::prost::alloc::vec::Vec>, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub 
struct BloomFilterIndex { #[prost(message, repeated, tag = "1")] pub bloom_filter: ::prost::alloc::vec::Vec, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Stream { #[prost(enumeration = "stream::Kind", optional, tag = "1")] @@ -135,14 +181,70 @@ pub mod stream { Secondary = 5, RowIndex = 6, BloomFilter = 7, + BloomFilterUtf8 = 8, + /// Virtual stream kinds to allocate space for encrypted index and data. + EncryptedIndex = 9, + EncryptedData = 10, + /// stripe statistics streams + StripeStatistics = 100, + /// A virtual stream kind that is used for setting the encryption IV. + FileStatistics = 101, + } + impl Kind { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Kind::Present => "PRESENT", + Kind::Data => "DATA", + Kind::Length => "LENGTH", + Kind::DictionaryData => "DICTIONARY_DATA", + Kind::DictionaryCount => "DICTIONARY_COUNT", + Kind::Secondary => "SECONDARY", + Kind::RowIndex => "ROW_INDEX", + Kind::BloomFilter => "BLOOM_FILTER", + Kind::BloomFilterUtf8 => "BLOOM_FILTER_UTF8", + Kind::EncryptedIndex => "ENCRYPTED_INDEX", + Kind::EncryptedData => "ENCRYPTED_DATA", + Kind::StripeStatistics => "STRIPE_STATISTICS", + Kind::FileStatistics => "FILE_STATISTICS", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "PRESENT" => Some(Self::Present), + "DATA" => Some(Self::Data), + "LENGTH" => Some(Self::Length), + "DICTIONARY_DATA" => Some(Self::DictionaryData), + "DICTIONARY_COUNT" => Some(Self::DictionaryCount), + "SECONDARY" => Some(Self::Secondary), + "ROW_INDEX" => Some(Self::RowIndex), + "BLOOM_FILTER" => Some(Self::BloomFilter), + "BLOOM_FILTER_UTF8" => Some(Self::BloomFilterUtf8), + "ENCRYPTED_INDEX" => Some(Self::EncryptedIndex), + "ENCRYPTED_DATA" => Some(Self::EncryptedData), + "STRIPE_STATISTICS" => Some(Self::StripeStatistics), + "FILE_STATISTICS" => Some(Self::FileStatistics), + _ => None, + } + } } } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] pub struct ColumnEncoding { #[prost(enumeration = "column_encoding::Kind", optional, tag = "1")] pub kind: ::core::option::Option, #[prost(uint32, optional, tag = "2")] pub dictionary_size: ::core::option::Option, + /// The encoding of the bloom filters for this column: + /// 0 or missing = none or original + /// 1 = ORC-135 (utc for timestamps) + #[prost(uint32, optional, tag = "3")] + pub bloom_encoding: ::core::option::Option, } /// Nested message and enum types in `ColumnEncoding`. pub mod column_encoding { @@ -154,7 +256,49 @@ pub mod column_encoding { DirectV2 = 2, DictionaryV2 = 3, } + impl Kind { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Kind::Direct => "DIRECT", + Kind::Dictionary => "DICTIONARY", + Kind::DirectV2 => "DIRECT_V2", + Kind::DictionaryV2 => "DICTIONARY_V2", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "DIRECT" => Some(Self::Direct), + "DICTIONARY" => Some(Self::Dictionary), + "DIRECT_V2" => Some(Self::DirectV2), + "DICTIONARY_V2" => Some(Self::DictionaryV2), + _ => None, + } + } + } } +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct StripeEncryptionVariant { + #[prost(message, repeated, tag = "1")] + pub streams: ::prost::alloc::vec::Vec, + #[prost(message, repeated, tag = "2")] + pub encoding: ::prost::alloc::vec::Vec, +} +// each stripe looks like: +// index streams +// unencrypted +// variant 1..N +// data streams +// unencrypted +// variant 1..N +// footer + +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct StripeFooter { #[prost(message, repeated, tag = "1")] @@ -163,7 +307,26 @@ pub struct StripeFooter { pub columns: ::prost::alloc::vec::Vec, #[prost(string, optional, tag = "3")] pub writer_timezone: ::core::option::Option<::prost::alloc::string::String>, + /// one for each column encryption variant + #[prost(message, repeated, tag = "4")] + pub encryption: ::prost::alloc::vec::Vec, +} +// the file tail looks like: +// encrypted stripe statistics: ColumnarStripeStatistics (order by variant) +// stripe statistics: Metadata +// footer: Footer +// postscript: PostScript +// psLen: byte + +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct StringPair { + #[prost(string, optional, tag = "1")] + pub key: ::core::option::Option<::prost::alloc::string::String>, + #[prost(string, optional, tag = "2")] + pub value: ::core::option::Option<::prost::alloc::string::String>, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Type { #[prost(enumeration = "r#type::Kind", optional, tag = "1")] @@ -178,6 +341,8 @@ pub struct Type { pub precision: ::core::option::Option, 
#[prost(uint32, optional, tag = "6")] pub scale: ::core::option::Option, + #[prost(message, repeated, tag = "7")] + pub attributes: ::prost::alloc::vec::Vec, } /// Nested message and enum types in `Type`. pub mod r#type { @@ -202,21 +367,98 @@ pub mod r#type { Date = 15, Varchar = 16, Char = 17, + TimestampInstant = 18, + } + impl Kind { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Kind::Boolean => "BOOLEAN", + Kind::Byte => "BYTE", + Kind::Short => "SHORT", + Kind::Int => "INT", + Kind::Long => "LONG", + Kind::Float => "FLOAT", + Kind::Double => "DOUBLE", + Kind::String => "STRING", + Kind::Binary => "BINARY", + Kind::Timestamp => "TIMESTAMP", + Kind::List => "LIST", + Kind::Map => "MAP", + Kind::Struct => "STRUCT", + Kind::Union => "UNION", + Kind::Decimal => "DECIMAL", + Kind::Date => "DATE", + Kind::Varchar => "VARCHAR", + Kind::Char => "CHAR", + Kind::TimestampInstant => "TIMESTAMP_INSTANT", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "BOOLEAN" => Some(Self::Boolean), + "BYTE" => Some(Self::Byte), + "SHORT" => Some(Self::Short), + "INT" => Some(Self::Int), + "LONG" => Some(Self::Long), + "FLOAT" => Some(Self::Float), + "DOUBLE" => Some(Self::Double), + "STRING" => Some(Self::String), + "BINARY" => Some(Self::Binary), + "TIMESTAMP" => Some(Self::Timestamp), + "LIST" => Some(Self::List), + "MAP" => Some(Self::Map), + "STRUCT" => Some(Self::Struct), + "UNION" => Some(Self::Union), + "DECIMAL" => Some(Self::Decimal), + "DATE" => Some(Self::Date), + "VARCHAR" => Some(Self::Varchar), + "CHAR" => Some(Self::Char), + "TIMESTAMP_INSTANT" => Some(Self::TimestampInstant), + _ => None, + } + } } } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct StripeInformation { + /// the global file offset of the start of the stripe #[prost(uint64, optional, tag = "1")] pub offset: ::core::option::Option, + /// the number of bytes of index #[prost(uint64, optional, tag = "2")] pub index_length: ::core::option::Option, + /// the number of bytes of data #[prost(uint64, optional, tag = "3")] pub data_length: ::core::option::Option, + /// the number of bytes in the stripe footer #[prost(uint64, optional, tag = "4")] pub footer_length: ::core::option::Option, + /// the number of rows in this stripe #[prost(uint64, optional, tag = "5")] pub number_of_rows: ::core::option::Option, + /// If this is present, the reader should use this value for the encryption + /// stripe id for setting the encryption IV. Otherwise, the reader should + /// use one larger than the previous stripe's encryptStripeId. + /// For unmerged ORC files, the first stripe will use 1 and the rest of the + /// stripes won't have it set. For merged files, the stripe information + /// will be copied from their original files and thus the first stripe of + /// each of the input files will reset it to 1. 
+ /// Note that 1 was choosen, because protobuf v3 doesn't serialize + /// primitive types that are the default (eg. 0). + #[prost(uint64, optional, tag = "6")] + pub encrypt_stripe_id: ::core::option::Option, + /// For each encryption variant, the new encrypted local key to use + /// until we find a replacement. + #[prost(bytes = "vec", repeated, tag = "7")] + pub encrypted_local_keys: ::prost::alloc::vec::Vec<::prost::alloc::vec::Vec>, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct UserMetadataItem { #[prost(string, optional, tag = "1")] @@ -224,16 +466,105 @@ pub struct UserMetadataItem { #[prost(bytes = "vec", optional, tag = "2")] pub value: ::core::option::Option<::prost::alloc::vec::Vec>, } +/// StripeStatistics (1 per a stripe), which each contain the +/// ColumnStatistics for each column. +/// This message type is only used in ORC v0 and v1. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct StripeStatistics { #[prost(message, repeated, tag = "1")] pub col_stats: ::prost::alloc::vec::Vec, } +/// This message type is only used in ORC v0 and v1. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Metadata { #[prost(message, repeated, tag = "1")] pub stripe_stats: ::prost::alloc::vec::Vec, } +/// In ORC v2 (and for encrypted columns in v1), each column has +/// their column statistics written separately. 
+#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ColumnarStripeStatistics { + /// one value for each stripe in the file + #[prost(message, repeated, tag = "1")] + pub col_stats: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct FileStatistics { + #[prost(message, repeated, tag = "1")] + pub column: ::prost::alloc::vec::Vec, +} +/// How was the data masked? This isn't necessary for reading the file, but +/// is documentation about how the file was written. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct DataMask { + /// the kind of masking, which may include third party masks + #[prost(string, optional, tag = "1")] + pub name: ::core::option::Option<::prost::alloc::string::String>, + /// parameters for the mask + #[prost(string, repeated, tag = "2")] + pub mask_parameters: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + /// the unencrypted column roots this mask was applied to + #[prost(uint32, repeated, tag = "3")] + pub columns: ::prost::alloc::vec::Vec, +} +/// Information about the encryption keys. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct EncryptionKey { + #[prost(string, optional, tag = "1")] + pub key_name: ::core::option::Option<::prost::alloc::string::String>, + #[prost(uint32, optional, tag = "2")] + pub key_version: ::core::option::Option, + #[prost(enumeration = "EncryptionAlgorithm", optional, tag = "3")] + pub algorithm: ::core::option::Option, +} +/// The description of an encryption variant. +/// Each variant is a single subtype that is encrypted with a single key. 
+#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct EncryptionVariant { + /// the column id of the root + #[prost(uint32, optional, tag = "1")] + pub root: ::core::option::Option, + /// The master key that was used to encrypt the local key, referenced as + /// an index into the Encryption.key list. + #[prost(uint32, optional, tag = "2")] + pub key: ::core::option::Option, + /// the encrypted key for the file footer + #[prost(bytes = "vec", optional, tag = "3")] + pub encrypted_key: ::core::option::Option<::prost::alloc::vec::Vec>, + /// the stripe statistics for this variant + #[prost(message, repeated, tag = "4")] + pub stripe_statistics: ::prost::alloc::vec::Vec, + /// encrypted file statistics as a FileStatistics + #[prost(bytes = "vec", optional, tag = "5")] + pub file_statistics: ::core::option::Option<::prost::alloc::vec::Vec>, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Encryption { + /// all of the masks used in this file + #[prost(message, repeated, tag = "1")] + pub mask: ::prost::alloc::vec::Vec, + /// all of the keys used in this file + #[prost(message, repeated, tag = "2")] + pub key: ::prost::alloc::vec::Vec, + /// The encrypted variants. + /// Readers should prefer the first variant that the user has access to + /// the corresponding key. If they don't have access to any of the keys, + /// they should get the unencrypted masked data. + #[prost(message, repeated, tag = "3")] + pub variants: ::prost::alloc::vec::Vec, + /// How are the local keys encrypted? 
+ #[prost(enumeration = "KeyProviderKind", optional, tag = "4")] + pub key_provider: ::core::option::Option, +} +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Footer { #[prost(uint64, optional, tag = "1")] @@ -252,8 +583,27 @@ pub struct Footer { pub statistics: ::prost::alloc::vec::Vec, #[prost(uint32, optional, tag = "8")] pub row_index_stride: ::core::option::Option, + /// Each implementation that writes ORC files should register for a code + /// 0 = ORC Java + /// 1 = ORC C++ + /// 2 = Presto + /// 3 = Scritchley Go from + /// 4 = Trino + #[prost(uint32, optional, tag = "9")] + pub writer: ::core::option::Option, + /// information about the encryption in this file + #[prost(message, optional, tag = "10")] + pub encryption: ::core::option::Option, + #[prost(enumeration = "CalendarKind", optional, tag = "11")] + pub calendar: ::core::option::Option, + /// informative description about the version of the software that wrote + /// the file. It is assumed to be within a given writer, so for example + /// ORC 1.7.2 = "1.7.2". It may include suffixes, such as "-SNAPSHOT". 
+ #[prost(string, optional, tag = "12")] + pub software_version: ::core::option::Option<::prost::alloc::string::String>, } /// Serialized length must be less that 255 bytes +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct PostScript { #[prost(uint64, optional, tag = "1")] @@ -263,21 +613,165 @@ pub struct PostScript { #[prost(uint64, optional, tag = "3")] pub compression_block_size: ::core::option::Option, /// the version of the file format - /// [0, 11] = Hive 0.11 - /// [0, 12] = Hive 0.12 + /// \[0, 11\] = Hive 0.11 + /// \[0, 12\] = Hive 0.12 #[prost(uint32, repeated, tag = "4")] pub version: ::prost::alloc::vec::Vec, #[prost(uint64, optional, tag = "5")] pub metadata_length: ::core::option::Option, - /// Version of the writer: - /// 0 (or missing) = original - /// 1 = HIVE-8732 fixed + /// The version of the writer that wrote the file. This number is + /// updated when we make fixes or large changes to the writer so that + /// readers can detect whether a given bug is present in the data. + /// + /// Only the Java ORC writer may use values under 6 (or missing) so that + /// readers that predate ORC-202 treat the new writers correctly. Each + /// writer should assign their own sequence of versions starting from 6. 
+ /// + /// Version of the ORC Java writer: + /// 0 = original + /// 1 = HIVE-8732 fixed (fixed stripe/file maximum statistics & + /// string statistics use utf8 for min/max) + /// 2 = HIVE-4243 fixed (use real column names from Hive tables) + /// 3 = HIVE-12055 added (vectorized writer implementation) + /// 4 = HIVE-13083 fixed (decimals write present stream correctly) + /// 5 = ORC-101 fixed (bloom filters use utf8 consistently) + /// 6 = ORC-135 fixed (timestamp statistics use utc) + /// 7 = ORC-517 fixed (decimal64 min/max incorrect) + /// 8 = ORC-203 added (trim very long string statistics) + /// 9 = ORC-14 added (column encryption) + /// + /// Version of the ORC C++ writer: + /// 6 = original + /// + /// Version of the Presto writer: + /// 6 = original + /// + /// Version of the Scritchley Go writer: + /// 6 = original + /// + /// Version of the Trino writer: + /// 6 = original + /// #[prost(uint32, optional, tag = "6")] pub writer_version: ::core::option::Option, + /// the number of bytes in the encrypted stripe statistics + #[prost(uint64, optional, tag = "7")] + pub stripe_statistics_length: ::core::option::Option, /// Leave this last in the record #[prost(string, optional, tag = "8000")] pub magic: ::core::option::Option<::prost::alloc::string::String>, } +/// The contents of the file tail that must be serialized. +/// This gets serialized as part of OrcSplit, also used by footer cache. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct FileTail { + #[prost(message, optional, tag = "1")] + pub postscript: ::core::option::Option, + #[prost(message, optional, tag = "2")] + pub footer: ::core::option::Option