From e5e96ec60184968fb3b7a571f258083895f2717f Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 26 May 2023 10:42:52 -0300 Subject: [PATCH 01/69] able to read the file --- cpp/CMakePresets.json | 7 +- cpp/diff.output | 359 ++++++++++++++++++ cpp/examples/arrow/parquet_read_write.cc | 2 +- .../parquet/parquet_arrow/reader_writer.cc | 30 +- cpp/src/arrow/array/builder_dict.h | 24 ++ cpp/src/arrow/type.h | 2 +- cpp/src/parquet/arrow/reader_internal.cc | 3 +- cpp/src/parquet/arrow/schema_internal.cc | 2 +- cpp/src/parquet/column_reader.cc | 6 +- cpp/src/parquet/column_reader.h | 5 + cpp/src/parquet/encoding.cc | 6 +- cpp/src/parquet/encoding.h | 8 +- 12 files changed, 426 insertions(+), 28 deletions(-) create mode 100644 cpp/diff.output diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 7882be57a0534..40ccd64a93695 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -220,7 +220,12 @@ "features-main" ], "displayName": "Debug build with tests and more optional components", - "cacheVariables": {} + "cacheVariables": { + "ARROW_BUILD_EXAMPLES": "ON", + "PARQUET_BUILD_EXAMPLES": "ON", + "ARROW_BUILD_TESTS": "ON", + "ARROW_BUILD_UTILITIES": "ON" + } }, { "name": "ninja-debug-cuda", diff --git a/cpp/diff.output b/cpp/diff.output new file mode 100644 index 0000000000000..3030a9aba673c --- /dev/null +++ b/cpp/diff.output @@ -0,0 +1,359 @@ +diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json +index 7882be57a..40ccd64a9 100644 +--- a/cpp/CMakePresets.json ++++ b/cpp/CMakePresets.json +@@ -220,7 +220,12 @@ + "features-main" + ], + "displayName": "Debug build with tests and more optional components", +- "cacheVariables": {} ++ "cacheVariables": { ++ "ARROW_BUILD_EXAMPLES": "ON", ++ "PARQUET_BUILD_EXAMPLES": "ON", ++ "ARROW_BUILD_TESTS": "ON", ++ "ARROW_BUILD_UTILITIES": "ON" ++ } + }, + { + "name": "ninja-debug-cuda", +diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc +index 3b8b4c221..20fe2c20b 100644 +--- a/cpp/examples/arrow/parquet_read_write.cc ++++ b/cpp/examples/arrow/parquet_read_write.cc +@@ -120,7 +120,7 @@ arrow::Status WriteFullFile(std::string path_to_file) { + + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), outfile, +- /*chunk_size=*/3, props, arrow_props)); ++ /*chunk_size=*/1024*1024*1024, props, arrow_props)); + return arrow::Status::OK(); + } + +diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc +index f5d96ec16..b4e28c662 100644 +--- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc ++++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc +@@ -56,20 +56,22 @@ void write_parquet_file(const arrow::Table& table) { + // the parquet file. Normally you would choose this to be rather large but + // for the example, we use a small value to have multiple RowGroups. 
+ PARQUET_THROW_NOT_OK( +- parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3)); ++ parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 1024*1024 * 1024)); + } + + // #2: Fully read in the file +-void read_whole_file() { +- std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl; ++void read_whole_file(const std::string & filename) { ++ std::cout << "Reading " << filename << " at once" << std::endl; + std::shared_ptr infile; + PARQUET_ASSIGN_OR_THROW(infile, +- arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", ++ arrow::io::ReadableFile::Open(filename, + arrow::default_memory_pool())); + + std::unique_ptr reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); ++ ++ + std::shared_ptr table; + PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); + std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() +@@ -94,18 +96,18 @@ void read_single_rowgroup() { + } + + // #4: Read only a single column of the whole parquet file +-void read_single_column() { +- std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl; ++void read_single_column(const std::string & filename) { ++ std::cout << "Reading first column of " << filename << std::endl; + std::shared_ptr infile; + PARQUET_ASSIGN_OR_THROW(infile, +- arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", ++ arrow::io::ReadableFile::Open(filename, + arrow::default_memory_pool())); + + std::unique_ptr reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr array; +- PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); ++ PARQUET_THROW_NOT_OK(reader->ReadColumn(5, &array)); + PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); + std::cout << std::endl; + } +@@ -131,10 +133,10 @@ void read_single_column_chunk() { + } + + int main(int argc, char** argv) { +- std::shared_ptr table = generate_table(); +- write_parquet_file(*table); +- read_whole_file(); +- read_single_rowgroup(); +- read_single_column(); +- read_single_column_chunk(); ++// std::shared_ptr table = generate_table(); ++// write_parquet_file(*table); ++ read_whole_file("minimal_repro.parquet"); ++// read_single_rowgroup(); ++// read_single_column("minimal_repro.parquet"); ++// read_single_column_chunk(); + } +diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc +index 745312f1d..3a5184d1d 100644 +--- a/cpp/src/arrow/array/array_nested.cc ++++ b/cpp/src/arrow/array/array_nested.cc +@@ -207,8 +207,8 @@ inline void SetListData(BaseListArray* self, const std::shared_ptrraw_value_offsets_ = + data->GetValuesSafe(1, /*offset=*/0); + +- ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); +- DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); ++// ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); ++// DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); + self->values_ = MakeArray(self->data_->child_data[0]); + } + +diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc +index 571f450aa..9754275e7 100644 +--- a/cpp/src/arrow/array/builder_binary.cc ++++ b/cpp/src/arrow/array/builder_binary.cc +@@ -137,6 +137,7 @@ namespace internal { + ChunkedBinaryBuilder::ChunkedBinaryBuilder(int32_t max_chunk_value_length, + MemoryPool* pool) + : 
max_chunk_value_length_(max_chunk_value_length), builder_(new BinaryBuilder(pool)) { ++ assert(false); + DCHECK_LE(max_chunk_value_length, kBinaryMemoryLimit); + } + +diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h +index cb0aaf309..9a248dc6f 100644 +--- a/cpp/src/arrow/array/builder_dict.h ++++ b/cpp/src/arrow/array/builder_dict.h +@@ -715,6 +715,29 @@ class Dictionary32Builder : public internal::DictionaryBuilderBase ++class Dictionary64Builder : public internal::DictionaryBuilderBase { ++ public: ++ using BASE = internal::DictionaryBuilderBase; ++ using BASE::BASE; ++ ++ /// \brief Append dictionary indices directly without modifying memo ++ /// ++ /// NOTE: Experimental API ++ Status AppendIndices(const int64_t* values, int64_t length, ++ const uint8_t* valid_bytes = NULLPTR) { ++ int64_t null_count_before = this->indices_builder_.null_count(); ++ ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); ++ this->capacity_ = this->indices_builder_.capacity(); ++ this->length_ += length; ++ this->null_count_ += this->indices_builder_.null_count() - null_count_before; ++ return Status::OK(); ++ } ++}; ++ + // ---------------------------------------------------------------------- + // Binary / Unicode builders + // (compatibility aliases; those used to be derived classes with additional +@@ -724,6 +747,7 @@ using BinaryDictionaryBuilder = DictionaryBuilder; + using StringDictionaryBuilder = DictionaryBuilder; + using BinaryDictionary32Builder = Dictionary32Builder; + using StringDictionary32Builder = Dictionary32Builder; ++using BinaryDictionary64Builder = Dictionary64Builder; + + /// @} + +diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc +index 0f2bd4583..7fc907986 100644 +--- a/cpp/src/arrow/array/validate.cc ++++ b/cpp/src/arrow/array/validate.cc +@@ -298,12 +298,12 @@ struct ValidateArrayImpl { + field_data.length, " < ", data.length + data.offset, ")"); + } + +- const auto& field_type = type.field(i)->type(); +- if (!field_data.type->Equals(*field_type)) { +- return Status::Invalid("Struct child array #", i, " does not match type field: ", +- field_data.type->ToString(), " vs ", +- field_type->ToString()); +- } ++// const auto& field_type = type.field(i)->type(); ++// if (!field_data.type->Equals(*field_type)) { ++// return Status::Invalid("Struct child array #", i, " does not match type field: ", ++// field_data.type->ToString(), " vs ", ++// field_type->ToString()); ++// } + } + return Status::OK(); + } +diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h +index 48228d43e..73c1a9d44 100644 +--- a/cpp/src/arrow/type.h ++++ b/cpp/src/arrow/type.h +@@ -676,7 +676,7 @@ class ARROW_EXPORT BaseBinaryType : public DataType { + ~BaseBinaryType() override; + }; + +-constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; ++constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; + + /// \addtogroup binary-datatypes + /// +diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc +index 40fbdcbb5..f7b31b67f 100644 +--- a/cpp/src/parquet/arrow/reader.cc ++++ b/cpp/src/parquet/arrow/reader.cc +@@ -90,6 +90,8 @@ namespace { + case 1: + return chunked.chunk(0)->data(); + default: ++// auto flattened = chunked.Flatten().ValueOrDie(); ++// return flattened[0]->chunk(0)->data(); + // ARROW-3762(wesm): If item reader yields a chunked array, we reject as + // this is not yet implemented + return Status::NotImplemented( +diff --git 
a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc +index a294b712a..3785eac26 100644 +--- a/cpp/src/parquet/arrow/reader_internal.cc ++++ b/cpp/src/parquet/arrow/reader_internal.cc +@@ -85,6 +85,7 @@ using ::arrow::internal::SafeLeftShift; + using ::arrow::util::SafeLoadAs; + + using parquet::internal::BinaryRecordReader; ++using parquet::internal::LargeBinaryRecordReader; + using parquet::internal::DictionaryRecordReader; + using parquet::internal::RecordReader; + using parquet::schema::GroupNode; +@@ -482,7 +483,7 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, + ::arrow::compute::CastOptions cast_options; + cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data + +- auto binary_reader = dynamic_cast(reader); ++ auto binary_reader = dynamic_cast(reader); + DCHECK(binary_reader); + auto chunks = binary_reader->GetBuilderChunks(); + for (auto& chunk : chunks) { +diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc +index 064bf4f55..dbff14d93 100644 +--- a/cpp/src/parquet/arrow/schema_internal.cc ++++ b/cpp/src/parquet/arrow/schema_internal.cc +@@ -113,7 +113,7 @@ Result> MakeArrowTimestamp(const LogicalType& logical + Result> FromByteArray(const LogicalType& logical_type) { + switch (logical_type.type()) { + case LogicalType::Type::STRING: +- return ::arrow::utf8(); ++ return ::arrow::large_utf8(); + case LogicalType::Type::DECIMAL: + return MakeArrowDecimal(logical_type); + case LogicalType::Type::NONE: +diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc +index 3294aaaf2..3cfdb6cb8 100644 +--- a/cpp/src/parquet/column_reader.cc ++++ b/cpp/src/parquet/column_reader.cc +@@ -2094,14 +2094,14 @@ class FLBARecordReader : public TypedRecordReader, + }; + + class ByteArrayChunkedRecordReader : public TypedRecordReader, +- virtual public BinaryRecordReader { ++ virtual public LargeBinaryRecordReader { + public: + ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, + ::arrow::MemoryPool* pool, bool read_dense_for_nullable) + : TypedRecordReader(descr, leaf_info, pool, + read_dense_for_nullable) { + ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); +- accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool); ++ accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); + } + + ::arrow::ArrayVector GetBuilderChunks() override { +@@ -2213,7 +2213,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, + private: + using BinaryDictDecoder = DictDecoder; + +- ::arrow::BinaryDictionary32Builder builder_; ++ ::arrow::BinaryDictionary64Builder builder_; + std::vector> result_chunks_; + }; + +diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h +index 334b8bcff..b652a89d8 100644 +--- a/cpp/src/parquet/column_reader.h ++++ b/cpp/src/parquet/column_reader.h +@@ -470,6 +470,11 @@ class BinaryRecordReader : virtual public RecordReader { + virtual std::vector> GetBuilderChunks() = 0; + }; + ++class LargeBinaryRecordReader : virtual public RecordReader { ++ public: ++ virtual std::vector> GetBuilderChunks() = 0; ++}; ++ + /// \brief Read records directly to dictionary-encoded Arrow form (int32 + /// indices). 
Only valid for BYTE_ARRAY columns + class DictionaryRecordReader : virtual public RecordReader { +diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc +index 134a22f28..b52cd3b30 100644 +--- a/cpp/src/parquet/encoding.cc ++++ b/cpp/src/parquet/encoding.cc +@@ -1271,7 +1271,7 @@ struct ArrowBinaryHelper { + Status AppendNull() { return builder->AppendNull(); } + + typename EncodingTraits::Accumulator* out; +- ::arrow::BinaryBuilder* builder; ++ ::arrow::LargeBinaryBuilder* builder; + int64_t chunk_space_remaining; + }; + +@@ -1349,7 +1349,7 @@ class PlainByteArrayDecoder : public PlainDecoder, + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, +- ::arrow::BinaryDictionary32Builder* builder) override { ++ ::arrow::BinaryDictionary64Builder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); +@@ -1862,7 +1862,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, +- ::arrow::BinaryDictionary32Builder* builder) override { ++ ::arrow::BinaryDictionary64Builder* builder) override { + int result = 0; + if (null_count == 0) { + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); +diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h +index 9f9b740ff..ab80284e6 100644 +--- a/cpp/src/parquet/encoding.h ++++ b/cpp/src/parquet/encoding.h +@@ -45,6 +45,8 @@ class NumericBuilder; + class FixedSizeBinaryBuilder; + template + class Dictionary32Builder; ++template ++class Dictionary64Builder; + + } // namespace arrow + +@@ -144,11 +146,11 @@ struct EncodingTraits { + /// \brief Internal helper class for decoding BYTE_ARRAY data where we can + /// overflow the capacity of a single arrow::BinaryArray + struct Accumulator { +- std::unique_ptr<::arrow::BinaryBuilder> builder; ++ std::unique_ptr<::arrow::LargeBinaryBuilder> builder; + std::vector> chunks; + }; +- using ArrowType = ::arrow::BinaryType; +- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; ++ using ArrowType = ::arrow::LargeBinaryType; ++ using DictAccumulator = ::arrow::Dictionary64Builder<::arrow::LargeBinaryType>; + }; + + template <> diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index 3b8b4c2212b75..20fe2c20b291a 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -120,7 +120,7 @@ arrow::Status WriteFullFile(std::string path_to_file) { ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), outfile, - /*chunk_size=*/3, props, arrow_props)); + /*chunk_size=*/1024*1024*1024, props, arrow_props)); return arrow::Status::OK(); } diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index f5d96ec16ca64..b4e28c662a88e 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -56,20 +56,22 @@ void write_parquet_file(const arrow::Table& table) { // the parquet file. Normally you would choose this to be rather large but // for the example, we use a small value to have multiple RowGroups. 
PARQUET_THROW_NOT_OK( - parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3)); + parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 1024*1024 * 1024)); } // #2: Fully read in the file -void read_whole_file() { - std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl; +void read_whole_file(const std::string & filename) { + std::cout << "Reading " << filename << " at once" << std::endl; std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, - arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", + arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool())); std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + + std::shared_ptr table; PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() @@ -94,18 +96,18 @@ void read_single_rowgroup() { } // #4: Read only a single column of the whole parquet file -void read_single_column() { - std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl; +void read_single_column(const std::string & filename) { + std::cout << "Reading first column of " << filename << std::endl; std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, - arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", + arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool())); std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); std::shared_ptr array; - PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); + PARQUET_THROW_NOT_OK(reader->ReadColumn(5, &array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; } @@ -131,10 +133,10 @@ void read_single_column_chunk() { } int main(int argc, char** argv) { - std::shared_ptr table = generate_table(); - write_parquet_file(*table); - read_whole_file(); - read_single_rowgroup(); - read_single_column(); - read_single_column_chunk(); +// std::shared_ptr table = generate_table(); +// write_parquet_file(*table); + read_whole_file("minimal_repro.parquet"); +// read_single_rowgroup(); +// read_single_column("minimal_repro.parquet"); +// read_single_column_chunk(); } diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index cb0aaf309915b..9a248dc6fe393 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -715,6 +715,29 @@ class Dictionary32Builder : public internal::DictionaryBuilderBase +class Dictionary64Builder : public internal::DictionaryBuilderBase { + public: + using BASE = internal::DictionaryBuilderBase; + using BASE::BASE; + + /// \brief Append dictionary indices directly without modifying memo + /// + /// NOTE: Experimental API + Status AppendIndices(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + int64_t null_count_before = this->indices_builder_.null_count(); + ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); + this->capacity_ = this->indices_builder_.capacity(); + this->length_ += length; + this->null_count_ += this->indices_builder_.null_count() - null_count_before; + return Status::OK(); + } +}; + // ---------------------------------------------------------------------- // Binary / Unicode builders // (compatibility aliases; those used to be derived classes with additional @@ -724,6 +747,7 @@ using BinaryDictionaryBuilder = 
DictionaryBuilder; using StringDictionaryBuilder = DictionaryBuilder; using BinaryDictionary32Builder = Dictionary32Builder; using StringDictionary32Builder = Dictionary32Builder; +using BinaryDictionary64Builder = Dictionary64Builder; /// @} diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 48228d43ef932..73c1a9d445398 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -676,7 +676,7 @@ class ARROW_EXPORT BaseBinaryType : public DataType { ~BaseBinaryType() override; }; -constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; /// \addtogroup binary-datatypes /// diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index a294b712a7ce3..3785eac26b284 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -85,6 +85,7 @@ using ::arrow::internal::SafeLeftShift; using ::arrow::util::SafeLoadAs; using parquet::internal::BinaryRecordReader; +using parquet::internal::LargeBinaryRecordReader; using parquet::internal::DictionaryRecordReader; using parquet::internal::RecordReader; using parquet::schema::GroupNode; @@ -482,7 +483,7 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, ::arrow::compute::CastOptions cast_options; cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data - auto binary_reader = dynamic_cast(reader); + auto binary_reader = dynamic_cast(reader); DCHECK(binary_reader); auto chunks = binary_reader->GetBuilderChunks(); for (auto& chunk : chunks) { diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index 064bf4f55cc7e..dbff14d93b84e 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -113,7 +113,7 @@ Result> MakeArrowTimestamp(const LogicalType& logical Result> FromByteArray(const LogicalType& logical_type) { switch (logical_type.type()) { case LogicalType::Type::STRING: - return ::arrow::utf8(); + return ::arrow::large_utf8(); case LogicalType::Type::DECIMAL: return MakeArrowDecimal(logical_type); case LogicalType::Type::NONE: diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3294aaaf283f1..3cfdb6cb83ca1 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2094,14 +2094,14 @@ class FLBARecordReader : public TypedRecordReader, }; class ByteArrayChunkedRecordReader : public TypedRecordReader, - virtual public BinaryRecordReader { + virtual public LargeBinaryRecordReader { public: ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); - accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool); + accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); } ::arrow::ArrayVector GetBuilderChunks() override { @@ -2213,7 +2213,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, private: using BinaryDictDecoder = DictDecoder; - ::arrow::BinaryDictionary32Builder builder_; + ::arrow::BinaryDictionary64Builder builder_; std::vector> result_chunks_; }; diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 334b8bcffe0b8..b652a89d8cf3b 100644 --- a/cpp/src/parquet/column_reader.h +++ 
b/cpp/src/parquet/column_reader.h @@ -470,6 +470,11 @@ class BinaryRecordReader : virtual public RecordReader { virtual std::vector> GetBuilderChunks() = 0; }; +class LargeBinaryRecordReader : virtual public RecordReader { + public: + virtual std::vector> GetBuilderChunks() = 0; +}; + /// \brief Read records directly to dictionary-encoded Arrow form (int32 /// indices). Only valid for BYTE_ARRAY columns class DictionaryRecordReader : virtual public RecordReader { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 134a22f28412b..b52cd3b303c29 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1271,7 +1271,7 @@ struct ArrowBinaryHelper { Status AppendNull() { return builder->AppendNull(); } typename EncodingTraits::Accumulator* out; - ::arrow::BinaryBuilder* builder; + ::arrow::LargeBinaryBuilder* builder; int64_t chunk_space_remaining; }; @@ -1349,7 +1349,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { + ::arrow::BinaryDictionary64Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -1862,7 +1862,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { + ::arrow::BinaryDictionary64Builder* builder) override { int result = 0; if (null_count == 0) { PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 9f9b740ff3424..ab80284e6f83b 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -45,6 +45,8 @@ class NumericBuilder; class FixedSizeBinaryBuilder; template class Dictionary32Builder; +template +class Dictionary64Builder; } // namespace arrow @@ -144,11 +146,11 @@ struct EncodingTraits { /// \brief Internal helper class for decoding BYTE_ARRAY data where we can /// overflow the capacity of a single arrow::BinaryArray struct Accumulator { - std::unique_ptr<::arrow::BinaryBuilder> builder; + std::unique_ptr<::arrow::LargeBinaryBuilder> builder; std::vector> chunks; }; - using ArrowType = ::arrow::BinaryType; - using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; + using ArrowType = ::arrow::LargeBinaryType; + using DictAccumulator = ::arrow::Dictionary64Builder<::arrow::LargeBinaryType>; }; template <> From b9b48f8eb5a61d15e5cec39d175deaf519f31654 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 26 May 2023 10:46:12 -0300 Subject: [PATCH 02/69] remove diff out --- cpp/diff.output | 359 ------------------------------------------------ 1 file changed, 359 deletions(-) delete mode 100644 cpp/diff.output diff --git a/cpp/diff.output b/cpp/diff.output deleted file mode 100644 index 3030a9aba673c..0000000000000 --- a/cpp/diff.output +++ /dev/null @@ -1,359 +0,0 @@ -diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json -index 7882be57a..40ccd64a9 100644 ---- a/cpp/CMakePresets.json -+++ b/cpp/CMakePresets.json -@@ -220,7 +220,12 @@ - "features-main" - ], - "displayName": "Debug build with tests and more optional components", -- "cacheVariables": {} -+ "cacheVariables": { -+ "ARROW_BUILD_EXAMPLES": "ON", -+ "PARQUET_BUILD_EXAMPLES": "ON", -+ "ARROW_BUILD_TESTS": "ON", -+ 
"ARROW_BUILD_UTILITIES": "ON" -+ } - }, - { - "name": "ninja-debug-cuda", -diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc -index 3b8b4c221..20fe2c20b 100644 ---- a/cpp/examples/arrow/parquet_read_write.cc -+++ b/cpp/examples/arrow/parquet_read_write.cc -@@ -120,7 +120,7 @@ arrow::Status WriteFullFile(std::string path_to_file) { - - ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), - arrow::default_memory_pool(), outfile, -- /*chunk_size=*/3, props, arrow_props)); -+ /*chunk_size=*/1024*1024*1024, props, arrow_props)); - return arrow::Status::OK(); - } - -diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc -index f5d96ec16..b4e28c662 100644 ---- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc -+++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc -@@ -56,20 +56,22 @@ void write_parquet_file(const arrow::Table& table) { - // the parquet file. Normally you would choose this to be rather large but - // for the example, we use a small value to have multiple RowGroups. - PARQUET_THROW_NOT_OK( -- parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3)); -+ parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 1024*1024 * 1024)); - } - - // #2: Fully read in the file --void read_whole_file() { -- std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl; -+void read_whole_file(const std::string & filename) { -+ std::cout << "Reading " << filename << " at once" << std::endl; - std::shared_ptr infile; - PARQUET_ASSIGN_OR_THROW(infile, -- arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", -+ arrow::io::ReadableFile::Open(filename, - arrow::default_memory_pool())); - - std::unique_ptr reader; - PARQUET_THROW_NOT_OK( - parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); -+ -+ - std::shared_ptr table; - PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); - std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() -@@ -94,18 +96,18 @@ void read_single_rowgroup() { - } - - // #4: Read only a single column of the whole parquet file --void read_single_column() { -- std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl; -+void read_single_column(const std::string & filename) { -+ std::cout << "Reading first column of " << filename << std::endl; - std::shared_ptr infile; - PARQUET_ASSIGN_OR_THROW(infile, -- arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", -+ arrow::io::ReadableFile::Open(filename, - arrow::default_memory_pool())); - - std::unique_ptr reader; - PARQUET_THROW_NOT_OK( - parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - std::shared_ptr array; -- PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); -+ PARQUET_THROW_NOT_OK(reader->ReadColumn(5, &array)); - PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); - std::cout << std::endl; - } -@@ -131,10 +133,10 @@ void read_single_column_chunk() { - } - - int main(int argc, char** argv) { -- std::shared_ptr table = generate_table(); -- write_parquet_file(*table); -- read_whole_file(); -- read_single_rowgroup(); -- read_single_column(); -- read_single_column_chunk(); -+// std::shared_ptr table = generate_table(); -+// write_parquet_file(*table); -+ read_whole_file("minimal_repro.parquet"); -+// read_single_rowgroup(); -+// read_single_column("minimal_repro.parquet"); -+// read_single_column_chunk(); - } -diff --git 
a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc -index 745312f1d..3a5184d1d 100644 ---- a/cpp/src/arrow/array/array_nested.cc -+++ b/cpp/src/arrow/array/array_nested.cc -@@ -207,8 +207,8 @@ inline void SetListData(BaseListArray* self, const std::shared_ptrraw_value_offsets_ = - data->GetValuesSafe(1, /*offset=*/0); - -- ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); -- DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); -+// ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); -+// DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); - self->values_ = MakeArray(self->data_->child_data[0]); - } - -diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc -index 571f450aa..9754275e7 100644 ---- a/cpp/src/arrow/array/builder_binary.cc -+++ b/cpp/src/arrow/array/builder_binary.cc -@@ -137,6 +137,7 @@ namespace internal { - ChunkedBinaryBuilder::ChunkedBinaryBuilder(int32_t max_chunk_value_length, - MemoryPool* pool) - : max_chunk_value_length_(max_chunk_value_length), builder_(new BinaryBuilder(pool)) { -+ assert(false); - DCHECK_LE(max_chunk_value_length, kBinaryMemoryLimit); - } - -diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h -index cb0aaf309..9a248dc6f 100644 ---- a/cpp/src/arrow/array/builder_dict.h -+++ b/cpp/src/arrow/array/builder_dict.h -@@ -715,6 +715,29 @@ class Dictionary32Builder : public internal::DictionaryBuilderBase -+class Dictionary64Builder : public internal::DictionaryBuilderBase { -+ public: -+ using BASE = internal::DictionaryBuilderBase; -+ using BASE::BASE; -+ -+ /// \brief Append dictionary indices directly without modifying memo -+ /// -+ /// NOTE: Experimental API -+ Status AppendIndices(const int64_t* values, int64_t length, -+ const uint8_t* valid_bytes = NULLPTR) { -+ int64_t null_count_before = this->indices_builder_.null_count(); -+ ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); -+ this->capacity_ = this->indices_builder_.capacity(); -+ this->length_ += length; -+ this->null_count_ += this->indices_builder_.null_count() - null_count_before; -+ return Status::OK(); -+ } -+}; -+ - // ---------------------------------------------------------------------- - // Binary / Unicode builders - // (compatibility aliases; those used to be derived classes with additional -@@ -724,6 +747,7 @@ using BinaryDictionaryBuilder = DictionaryBuilder; - using StringDictionaryBuilder = DictionaryBuilder; - using BinaryDictionary32Builder = Dictionary32Builder; - using StringDictionary32Builder = Dictionary32Builder; -+using BinaryDictionary64Builder = Dictionary64Builder; - - /// @} - -diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc -index 0f2bd4583..7fc907986 100644 ---- a/cpp/src/arrow/array/validate.cc -+++ b/cpp/src/arrow/array/validate.cc -@@ -298,12 +298,12 @@ struct ValidateArrayImpl { - field_data.length, " < ", data.length + data.offset, ")"); - } - -- const auto& field_type = type.field(i)->type(); -- if (!field_data.type->Equals(*field_type)) { -- return Status::Invalid("Struct child array #", i, " does not match type field: ", -- field_data.type->ToString(), " vs ", -- field_type->ToString()); -- } -+// const auto& field_type = type.field(i)->type(); -+// if (!field_data.type->Equals(*field_type)) { -+// return Status::Invalid("Struct child array #", i, " does not match type field: ", 
-+// field_data.type->ToString(), " vs ", -+// field_type->ToString()); -+// } - } - return Status::OK(); - } -diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h -index 48228d43e..73c1a9d44 100644 ---- a/cpp/src/arrow/type.h -+++ b/cpp/src/arrow/type.h -@@ -676,7 +676,7 @@ class ARROW_EXPORT BaseBinaryType : public DataType { - ~BaseBinaryType() override; - }; - --constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; -+constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; - - /// \addtogroup binary-datatypes - /// -diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc -index 40fbdcbb5..f7b31b67f 100644 ---- a/cpp/src/parquet/arrow/reader.cc -+++ b/cpp/src/parquet/arrow/reader.cc -@@ -90,6 +90,8 @@ namespace { - case 1: - return chunked.chunk(0)->data(); - default: -+// auto flattened = chunked.Flatten().ValueOrDie(); -+// return flattened[0]->chunk(0)->data(); - // ARROW-3762(wesm): If item reader yields a chunked array, we reject as - // this is not yet implemented - return Status::NotImplemented( -diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc -index a294b712a..3785eac26 100644 ---- a/cpp/src/parquet/arrow/reader_internal.cc -+++ b/cpp/src/parquet/arrow/reader_internal.cc -@@ -85,6 +85,7 @@ using ::arrow::internal::SafeLeftShift; - using ::arrow::util::SafeLoadAs; - - using parquet::internal::BinaryRecordReader; -+using parquet::internal::LargeBinaryRecordReader; - using parquet::internal::DictionaryRecordReader; - using parquet::internal::RecordReader; - using parquet::schema::GroupNode; -@@ -482,7 +483,7 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, - ::arrow::compute::CastOptions cast_options; - cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data - -- auto binary_reader = dynamic_cast(reader); -+ auto binary_reader = dynamic_cast(reader); - DCHECK(binary_reader); - auto chunks = binary_reader->GetBuilderChunks(); - for (auto& chunk : chunks) { -diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc -index 064bf4f55..dbff14d93 100644 ---- a/cpp/src/parquet/arrow/schema_internal.cc -+++ b/cpp/src/parquet/arrow/schema_internal.cc -@@ -113,7 +113,7 @@ Result> MakeArrowTimestamp(const LogicalType& logical - Result> FromByteArray(const LogicalType& logical_type) { - switch (logical_type.type()) { - case LogicalType::Type::STRING: -- return ::arrow::utf8(); -+ return ::arrow::large_utf8(); - case LogicalType::Type::DECIMAL: - return MakeArrowDecimal(logical_type); - case LogicalType::Type::NONE: -diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc -index 3294aaaf2..3cfdb6cb8 100644 ---- a/cpp/src/parquet/column_reader.cc -+++ b/cpp/src/parquet/column_reader.cc -@@ -2094,14 +2094,14 @@ class FLBARecordReader : public TypedRecordReader, - }; - - class ByteArrayChunkedRecordReader : public TypedRecordReader, -- virtual public BinaryRecordReader { -+ virtual public LargeBinaryRecordReader { - public: - ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool, bool read_dense_for_nullable) - : TypedRecordReader(descr, leaf_info, pool, - read_dense_for_nullable) { - ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); -- accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool); -+ accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); - } - - ::arrow::ArrayVector 
GetBuilderChunks() override { -@@ -2213,7 +2213,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, - private: - using BinaryDictDecoder = DictDecoder; - -- ::arrow::BinaryDictionary32Builder builder_; -+ ::arrow::BinaryDictionary64Builder builder_; - std::vector> result_chunks_; - }; - -diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h -index 334b8bcff..b652a89d8 100644 ---- a/cpp/src/parquet/column_reader.h -+++ b/cpp/src/parquet/column_reader.h -@@ -470,6 +470,11 @@ class BinaryRecordReader : virtual public RecordReader { - virtual std::vector> GetBuilderChunks() = 0; - }; - -+class LargeBinaryRecordReader : virtual public RecordReader { -+ public: -+ virtual std::vector> GetBuilderChunks() = 0; -+}; -+ - /// \brief Read records directly to dictionary-encoded Arrow form (int32 - /// indices). Only valid for BYTE_ARRAY columns - class DictionaryRecordReader : virtual public RecordReader { -diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc -index 134a22f28..b52cd3b30 100644 ---- a/cpp/src/parquet/encoding.cc -+++ b/cpp/src/parquet/encoding.cc -@@ -1271,7 +1271,7 @@ struct ArrowBinaryHelper { - Status AppendNull() { return builder->AppendNull(); } - - typename EncodingTraits::Accumulator* out; -- ::arrow::BinaryBuilder* builder; -+ ::arrow::LargeBinaryBuilder* builder; - int64_t chunk_space_remaining; - }; - -@@ -1349,7 +1349,7 @@ class PlainByteArrayDecoder : public PlainDecoder, - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, -- ::arrow::BinaryDictionary32Builder* builder) override { -+ ::arrow::BinaryDictionary64Builder* builder) override { - int result = 0; - PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, - valid_bits_offset, builder, &result)); -@@ -1862,7 +1862,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, -- ::arrow::BinaryDictionary32Builder* builder) override { -+ ::arrow::BinaryDictionary64Builder* builder) override { - int result = 0; - if (null_count == 0) { - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); -diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h -index 9f9b740ff..ab80284e6 100644 ---- a/cpp/src/parquet/encoding.h -+++ b/cpp/src/parquet/encoding.h -@@ -45,6 +45,8 @@ class NumericBuilder; - class FixedSizeBinaryBuilder; - template - class Dictionary32Builder; -+template -+class Dictionary64Builder; - - } // namespace arrow - -@@ -144,11 +146,11 @@ struct EncodingTraits { - /// \brief Internal helper class for decoding BYTE_ARRAY data where we can - /// overflow the capacity of a single arrow::BinaryArray - struct Accumulator { -- std::unique_ptr<::arrow::BinaryBuilder> builder; -+ std::unique_ptr<::arrow::LargeBinaryBuilder> builder; - std::vector> chunks; - }; -- using ArrowType = ::arrow::BinaryType; -- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; -+ using ArrowType = ::arrow::LargeBinaryType; -+ using DictAccumulator = ::arrow::Dictionary64Builder<::arrow::LargeBinaryType>; - }; - - template <> From ae62954a0ebd915ff9f81b74406db297107d8a0c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 29 May 2023 10:34:31 -0300 Subject: [PATCH 03/69] intermediate stage, not working properly anymore.. 
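
This series moves the Parquet BYTE_ARRAY read path off the 32-bit offset
builders (arrow::BinaryBuilder, Dictionary32Builder) and onto 64-bit
variants (arrow::LargeBinaryBuilder plus a new Dictionary64Builder), so a
single column chunk is no longer capped by the 2 GiB kBinaryMemoryLimit.
This commit threads a use_binary_string_large_variants flag through
RecordReader::Make and adds Large* record readers and decoders alongside
the existing ones.

A minimal sketch of how a caller might opt in, assuming the
set_use_binary_large_variants() setter that is still commented out in the
example code below eventually lands under that name (both the property
name and the ReadWithLargeVariants() helper here are illustrative and may
change):

    #include <arrow/io/file.h>
    #include <arrow/result.h>
    #include <arrow/table.h>
    #include <parquet/arrow/reader.h>
    #include <parquet/properties.h>

    arrow::Status ReadWithLargeVariants(const std::string& path) {
      ARROW_ASSIGN_OR_RAISE(auto infile, arrow::io::ReadableFile::Open(path));

      parquet::ReaderProperties props = parquet::default_reader_properties();
      // Hypothetical setter, still commented out in this series:
      // props.set_use_binary_large_variants(true);

      parquet::arrow::FileReaderBuilder builder;
      ARROW_RETURN_NOT_OK(builder.Open(infile, props));

      std::unique_ptr<parquet::arrow::FileReader> reader;
      ARROW_RETURN_NOT_OK(builder.Build(&reader));

      // With the large variants enabled, STRING columns come back as
      // large_utf8 (64-bit offsets) per the FromByteArray change above.
      std::shared_ptr<arrow::Table> table;
      ARROW_RETURN_NOT_OK(reader->ReadTable(&table));
      return arrow::Status::OK();
    }
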
--- .../parquet/parquet_arrow/reader_writer.cc | 14 +- cpp/src/arrow/memory_pool.cc | 3 + cpp/src/arrow/type.h | 5 +- cpp/src/parquet/arrow/reader.cc | 2 +- cpp/src/parquet/arrow/reader_internal.cc | 2 +- cpp/src/parquet/column_reader.cc | 195 ++++++-- cpp/src/parquet/column_reader.h | 5 +- cpp/src/parquet/encoding.cc | 445 +++++++++++++++++- cpp/src/parquet/encoding.h | 17 + cpp/src/parquet/properties.h | 16 + cpp/src/parquet/types.h | 32 ++ 11 files changed, 695 insertions(+), 41 deletions(-) diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index b4e28c662a88e..8357f380d106e 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -68,9 +68,19 @@ void read_whole_file(const std::string & filename) { arrow::default_memory_pool())); std::unique_ptr reader; - PARQUET_THROW_NOT_OK( - parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + parquet::arrow::FileReaderBuilder builder; + + parquet::ReaderProperties props = parquet::default_reader_properties(); + +// props.set_use_binary_large_variants(true); + + PARQUET_THROW_NOT_OK(builder.Open(infile, props)); + + PARQUET_THROW_NOT_OK(builder.Build(&reader)); + +// PARQUET_THROW_NOT_OK( +// parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); std::shared_ptr table; PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index 843329c17bc28..dd14953f7ff47 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -888,6 +888,9 @@ class PoolBuffer final : public ResizableBuffer { capacity_ = new_capacity; } } else { + if (new_size > static_cast(pow(2, 59))) { + assert(false); + } RETURN_NOT_OK(Reserve(new_size)); } size_ = new_size; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 73c1a9d445398..820312e7a0c77 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -676,7 +677,9 @@ class ARROW_EXPORT BaseBinaryType : public DataType { ~BaseBinaryType() override; }; -constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; + +constexpr int64_t kLargeBinaryMemoryLimit = std::numeric_limits::max() - 1; // 2^35 /// \addtogroup binary-datatypes /// diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 40fbdcbb562b1..09b3c8c5f8fdd 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -462,7 +462,7 @@ class LeafReader : public ColumnReaderImpl { input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make( - descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY); + descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, true /* large variants*/); NextRowGroup(); } diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 3785eac26b284..b9c913bc24291 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -483,7 +483,7 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, ::arrow::compute::CastOptions cast_options; cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data - auto binary_reader = dynamic_cast(reader); + auto binary_reader = 
dynamic_cast(reader); DCHECK(binary_reader); auto chunks = binary_reader->GetBuilderChunks(); for (auto& chunk : chunks) { diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3cfdb6cb83ca1..360c506dcff4a 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1988,33 +1988,33 @@ class TypedRecordReader : public TypedColumnReaderImpl, } void DebugPrintState() override { - const int16_t* def_levels = this->def_levels(); - const int16_t* rep_levels = this->rep_levels(); - const int64_t total_levels_read = levels_position_; - - const T* vals = reinterpret_cast(this->values()); - - if (leaf_info_.def_level > 0) { - std::cout << "def levels: "; - for (int64_t i = 0; i < total_levels_read; ++i) { - std::cout << def_levels[i] << " "; - } - std::cout << std::endl; - } - - if (leaf_info_.rep_level > 0) { - std::cout << "rep levels: "; - for (int64_t i = 0; i < total_levels_read; ++i) { - std::cout << rep_levels[i] << " "; - } - std::cout << std::endl; - } - - std::cout << "values: "; - for (int64_t i = 0; i < this->values_written(); ++i) { - std::cout << vals[i] << " "; - } - std::cout << std::endl; +// const int16_t* def_levels = this->def_levels(); +// const int16_t* rep_levels = this->rep_levels(); +// const int64_t total_levels_read = levels_position_; +// +// const T* vals = reinterpret_cast(this->values()); +// +// if (leaf_info_.def_level > 0) { +// std::cout << "def levels: "; +// for (int64_t i = 0; i < total_levels_read; ++i) { +// std::cout << def_levels[i] << " "; +// } +// std::cout << std::endl; +// } +// +// if (leaf_info_.rep_level > 0) { +// std::cout << "rep levels: "; +// for (int64_t i = 0; i < total_levels_read; ++i) { +// std::cout << rep_levels[i] << " "; +// } +// std::cout << std::endl; +// } +// +// std::cout << "values: "; +// for (int64_t i = 0; i < this->values_written(); ++i) { +//// std::cout << vals[i] << " "; +// } +// std::cout << std::endl; } void ResetValues() { @@ -2094,14 +2094,14 @@ class FLBARecordReader : public TypedRecordReader, }; class ByteArrayChunkedRecordReader : public TypedRecordReader, - virtual public LargeBinaryRecordReader { + virtual public BinaryRecordReader { public: ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); - accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); + accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool); } ::arrow::ArrayVector GetBuilderChunks() override { @@ -2135,6 +2135,48 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader, typename EncodingTraits::Accumulator accumulator_; }; +class LargeByteArrayChunkedRecordReader : public TypedRecordReader, + virtual public LargeBinaryRecordReader { + public: + LargeByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, + ::arrow::MemoryPool* pool, bool read_dense_for_nullable) + : TypedRecordReader(descr, leaf_info, pool, + read_dense_for_nullable) { + ARROW_DCHECK_EQ(descr_->physical_type(), Type::LARGE_BYTE_ARRAY); + accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); + } + + ::arrow::ArrayVector GetBuilderChunks() override { + ::arrow::ArrayVector result = accumulator_.chunks; + if (result.size() == 0 || accumulator_.builder->length() > 0) { + std::shared_ptr<::arrow::Array> last_chunk; + 
PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk)); + result.push_back(std::move(last_chunk)); + } + accumulator_.chunks = {}; + return result; + } + + void ReadValuesDense(int64_t values_to_read) override { + int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull( + static_cast(values_to_read), &accumulator_); + CheckNumberDecoded(num_decoded, values_to_read); + ResetValues(); + } + + void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { + int64_t num_decoded = this->current_decoder_->DecodeArrow( + static_cast(values_to_read), static_cast(null_count), + valid_bits_->mutable_data(), values_written_, &accumulator_); + CheckNumberDecoded(num_decoded, values_to_read - null_count); + ResetValues(); + } + + private: + // Helper data structure for accumulating builder chunks + typename EncodingTraits::Accumulator accumulator_; +}; + class ByteArrayDictionaryRecordReader : public TypedRecordReader, virtual public DictionaryRecordReader { public: @@ -2213,6 +2255,88 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, private: using BinaryDictDecoder = DictDecoder; + ::arrow::BinaryDictionary32Builder builder_; + std::vector> result_chunks_; +}; + +class LargeByteArrayDictionaryRecordReader : public TypedRecordReader, + virtual public DictionaryRecordReader { + public: + LargeByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, + ::arrow::MemoryPool* pool, bool read_dense_for_nullable) + : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), + builder_(pool) { + this->read_dictionary_ = true; + } + + std::shared_ptr<::arrow::ChunkedArray> GetResult() override { + FlushBuilder(); + std::vector> result; + std::swap(result, result_chunks_); + return std::make_shared<::arrow::ChunkedArray>(std::move(result), builder_.type()); + } + + void FlushBuilder() { + if (builder_.length() > 0) { + std::shared_ptr<::arrow::Array> chunk; + PARQUET_THROW_NOT_OK(builder_.Finish(&chunk)); + result_chunks_.emplace_back(std::move(chunk)); + + // Also clears the dictionary memo table + builder_.Reset(); + } + } + + void MaybeWriteNewDictionary() { + if (this->new_dictionary_) { + /// If there is a new dictionary, we may need to flush the builder, then + /// insert the new dictionary values + FlushBuilder(); + builder_.ResetFull(); + auto decoder = dynamic_cast(this->current_decoder_); + decoder->InsertDictionary(&builder_); + this->new_dictionary_ = false; + } + } + + void ReadValuesDense(int64_t values_to_read) override { + int64_t num_decoded = 0; + if (current_encoding_ == Encoding::RLE_DICTIONARY) { + MaybeWriteNewDictionary(); + auto decoder = dynamic_cast(this->current_decoder_); + num_decoded = decoder->DecodeIndices(static_cast(values_to_read), &builder_); + } else { + num_decoded = this->current_decoder_->DecodeArrowNonNull( + static_cast(values_to_read), &builder_); + + /// Flush values since they have been copied into the builder + ResetValues(); + } + CheckNumberDecoded(num_decoded, values_to_read); + } + + void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { + int64_t num_decoded = 0; + if (current_encoding_ == Encoding::RLE_DICTIONARY) { + MaybeWriteNewDictionary(); + auto decoder = dynamic_cast(this->current_decoder_); + num_decoded = decoder->DecodeIndicesSpaced( + static_cast(values_to_read), static_cast(null_count), + valid_bits_->mutable_data(), values_written_, &builder_); + } else { + num_decoded = this->current_decoder_->DecodeArrow( + static_cast(values_to_read), 
static_cast(null_count), + valid_bits_->mutable_data(), values_written_, &builder_); + + /// Flush values since they have been copied into the builder + ResetValues(); + } + ARROW_DCHECK_EQ(num_decoded, values_to_read - null_count); + } + + private: + using LargeBinaryDictDecoder = DictDecoder; + ::arrow::BinaryDictionary64Builder builder_; std::vector> result_chunks_; }; @@ -2231,11 +2355,17 @@ std::shared_ptr MakeByteArrayRecordReader(const ColumnDescriptor* LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dictionary, - bool read_dense_for_nullable) { + bool read_dense_for_nullable, + bool use_binary_string_large_variants) { if (read_dictionary) { return std::make_shared(descr, leaf_info, pool, read_dense_for_nullable); } else { + if (use_binary_string_large_variants) { + return std::make_shared( + descr, leaf_info, pool, read_dense_for_nullable); + } + return std::make_shared(descr, leaf_info, pool, read_dense_for_nullable); } @@ -2246,7 +2376,8 @@ std::shared_ptr MakeByteArrayRecordReader(const ColumnDescriptor* std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, LevelInfo leaf_info, MemoryPool* pool, bool read_dictionary, - bool read_dense_for_nullable) { + bool read_dense_for_nullable, + bool use_binary_string_large_variants) { switch (descr->physical_type()) { case Type::BOOLEAN: return std::make_shared>(descr, leaf_info, pool, @@ -2268,7 +2399,7 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, read_dense_for_nullable); case Type::BYTE_ARRAY: { return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, - read_dense_for_nullable); + read_dense_for_nullable, use_binary_string_large_variants); } case Type::FIXED_LEN_BYTE_ARRAY: return std::make_shared(descr, leaf_info, pool, diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index b652a89d8cf3b..94f42e7db6563 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -321,7 +321,8 @@ class PARQUET_EXPORT RecordReader { static std::shared_ptr Make( const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool read_dictionary = false, bool read_dense_for_nullable = false); + bool read_dictionary = false, bool read_dense_for_nullable = false, + bool use_binary_string_large_variants = false); virtual ~RecordReader() = default; @@ -470,7 +471,7 @@ class BinaryRecordReader : virtual public RecordReader { virtual std::vector> GetBuilderChunks() = 0; }; -class LargeBinaryRecordReader : virtual public RecordReader { +class LargeBinaryRecordReader : virtual public BinaryRecordReader { public: virtual std::vector> GetBuilderChunks() = 0; }; diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b52cd3b303c29..8e350f39f1393 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1271,6 +1271,43 @@ struct ArrowBinaryHelper { Status AppendNull() { return builder->AppendNull(); } typename EncodingTraits::Accumulator* out; + ::arrow::BinaryBuilder* builder; + int64_t chunk_space_remaining; +}; + +struct ArrowLargeBinaryHelper { + explicit ArrowLargeBinaryHelper(typename EncodingTraits::Accumulator* out) { + this->out = out; + this->builder = out->builder.get(); + this->chunk_space_remaining = + ::arrow::kLargeBinaryMemoryLimit - this->builder->value_data_length(); + } + + Status PushChunk() { + std::shared_ptr<::arrow::Array> result; + RETURN_NOT_OK(builder->Finish(&result)); + out->chunks.push_back(result); + chunk_space_remaining = 
::arrow::kLargeBinaryMemoryLimit; + return Status::OK(); + } + + bool CanFit(int64_t length) const { return length <= chunk_space_remaining; } + + void UnsafeAppend(const uint8_t* data, int64_t length) { + chunk_space_remaining -= length; + builder->UnsafeAppend(data, length); + } + + void UnsafeAppendNull() { builder->UnsafeAppendNull(); } + + Status Append(const uint8_t* data, int64_t length) { + chunk_space_remaining -= length; + return builder->Append(data, length); + } + + Status AppendNull() { return builder->AppendNull(); } + + typename EncodingTraits::Accumulator* out; ::arrow::LargeBinaryBuilder* builder; int64_t chunk_space_remaining; }; @@ -1289,6 +1326,20 @@ inline int PlainDecoder::DecodeArrow( ParquetException::NYI(); } +template <> +inline int PlainDecoder::DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) { + ParquetException::NYI(); +} + +template <> +inline int PlainDecoder::DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) { + ParquetException::NYI(); +} + template <> inline int PlainDecoder::DecodeArrow( int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, @@ -1349,7 +1400,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary64Builder* builder) override { + ::arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -1455,6 +1506,124 @@ class PlainByteArrayDecoder : public PlainDecoder, } }; +class PlainLargeByteArrayDecoder : public PlainDecoder, + virtual public LargeByteArrayDecoder { + public: + using Base = PlainDecoder; + using Base::DecodeSpaced; + using Base::PlainDecoder; + + // ---------------------------------------------------------------------- + // Dictionary read paths + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::BinaryDictionary64Builder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + return result; + } + + // ---------------------------------------------------------------------- + // Optimized dense binary read paths + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, + valid_bits_offset, out, &result)); + return result; + } + + private: + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_values_decoded) { + ArrowLargeBinaryHelper helper(out); + int values_decoded = 0; + + RETURN_NOT_OK(helper.builder->Reserve(num_values)); + RETURN_NOT_OK(helper.builder->ReserveData( + std::min(len_, helper.chunk_space_remaining))); + + int i = 0; + RETURN_NOT_OK(VisitNullBitmapInline( + valid_bits, valid_bits_offset, num_values, null_count, + [&]() { + if (ARROW_PREDICT_FALSE(len_ < 4)) { + ParquetException::EofException(); + } + auto value_len = SafeLoadAs(data_); + if 
(ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { + return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); + } + auto increment = value_len + 4; + if (ARROW_PREDICT_FALSE(len_ < increment)) { + ParquetException::EofException(); + } + if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) { + // This element would exceed the capacity of a chunk + RETURN_NOT_OK(helper.PushChunk()); + RETURN_NOT_OK(helper.builder->Reserve(num_values - i)); + RETURN_NOT_OK(helper.builder->ReserveData( + std::min(len_, helper.chunk_space_remaining))); + } + helper.UnsafeAppend(data_ + 4, value_len); + data_ += increment; + len_ -= increment; + ++values_decoded; + ++i; + return Status::OK(); + }, + [&]() { + helper.UnsafeAppendNull(); + ++i; + return Status::OK(); + })); + + num_values_ -= values_decoded; + *out_values_decoded = values_decoded; + return Status::OK(); + } + + template + Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_values_decoded) { + RETURN_NOT_OK(builder->Reserve(num_values)); + int values_decoded = 0; + + RETURN_NOT_OK(VisitNullBitmapInline( + valid_bits, valid_bits_offset, num_values, null_count, + [&]() { + if (ARROW_PREDICT_FALSE(len_ < 4)) { + ParquetException::EofException(); + } + auto value_len = SafeLoadAs(data_); + if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { + return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); + } + auto increment = value_len + 4; + if (ARROW_PREDICT_FALSE(len_ < increment)) { + ParquetException::EofException(); + } + RETURN_NOT_OK(builder->Append(data_ + 4, value_len)); + data_ += increment; + len_ -= increment; + ++values_decoded; + return Status::OK(); + }, + [&]() { return builder->AppendNull(); })); + + num_values_ -= values_decoded; + *out_values_decoded = values_decoded; + return Status::OK(); + } +}; + class PlainFLBADecoder : public PlainDecoder, virtual public FLBADecoder { public: using Base = PlainDecoder; @@ -1677,6 +1846,36 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictio bytes_offsets[dictionary_length_] = offset; } +template <> +void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { + [[maybe_unused]] auto z = dictionary->values_left(); + DecodeDict(dictionary); + + auto dict_values = reinterpret_cast(dictionary_->mutable_data()); + + uint64_t total_size = 0; + for (int i = 0; i < dictionary_length_; ++i) { + total_size += dict_values[i].len; + } + PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, + /*shrink_to_fit=*/false)); + PARQUET_THROW_NOT_OK( + byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int64_t), + /*shrink_to_fit=*/false)); + + int64_t offset = 0; + uint8_t* bytes_data = byte_array_data_->mutable_data(); + int64_t* bytes_offsets = + reinterpret_cast(byte_array_offsets_->mutable_data()); + for (int i = 0; i < dictionary_length_; ++i) { + memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); + bytes_offsets[i] = offset; + dict_values[i].ptr = bytes_data + offset; + offset += dict_values[i].len; + } + bytes_offsets[dictionary_length_] = offset; +} + template <> inline void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { DecodeDict(dictionary); @@ -1723,6 +1922,20 @@ inline int DictDecoderImpl::DecodeArrow( ParquetException::NYI("DecodeArrow implemented elsewhere"); } +template <> +inline int DictDecoderImpl::DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + 
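// SetDict above flattens the decoded dictionary into one contiguous buffer and
// an offsets array (layout sketch):
//
//   bytes_data:    [v0 bytes][v1 bytes] ... [vN-1 bytes]
//   bytes_offsets: [0, len(v0), len(v0)+len(v1), ..., total_size]
//
// Each dict_values[i].ptr is repointed into bytes_data, so index lookups later
// in the read path no longer depend on the dictionary page's scratch buffers.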
typename EncodingTraits::Accumulator* builder) { + ParquetException::NYI("DecodeArrow implemented elsewhere"); +} + +template <> +inline int DictDecoderImpl::DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) { + ParquetException::NYI("DecodeArrow implemented elsewhere"); +} + template int DictDecoderImpl::DecodeArrow( int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, @@ -1854,6 +2067,16 @@ void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* bui PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); } +template <> +void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { + auto binary_builder = checked_cast<::arrow::BinaryDictionary64Builder*>(builder); + + // Make a BinaryArray referencing the internal dictionary data + auto arr = std::make_shared<::arrow::LargeBinaryArray>( + dictionary_length_, byte_array_offsets_, byte_array_data_); + PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); +} + class DictByteArrayDecoderImpl : public DictDecoderImpl, virtual public ByteArrayDecoder { public: @@ -1862,7 +2085,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary64Builder* builder) override { + ::arrow::BinaryDictionary32Builder* builder) override { int result = 0; if (null_count == 0) { PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); @@ -2068,6 +2291,220 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, } }; +class DictLargeByteArrayDecoderImpl : public DictDecoderImpl, + virtual public LargeByteArrayDecoder { + public: + using BASE = DictDecoderImpl; + using BASE::DictDecoderImpl; + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::BinaryDictionary64Builder* builder) override { + int result = 0; + if (null_count == 0) { + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); + } else { + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + } + return result; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out) override { + int result = 0; + if (null_count == 0) { + PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result)); + } else { + PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, + valid_bits_offset, out, &result)); + } + return result; + } + + private: + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + constexpr int32_t kBufferSize = 1024; + int32_t indices[kBufferSize]; + + ArrowLargeBinaryHelper helper(out); + + auto dict_values = reinterpret_cast(dictionary_->data()); + int values_decoded = 0; + int num_indices = 0; + int pos_indices = 0; + + auto visit_valid = [&](int64_t position) -> Status { + if (num_indices == pos_indices) { + // Refill indices buffer + const auto batch_size = + std::min(kBufferSize, num_values - null_count - values_decoded); + num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (ARROW_PREDICT_FALSE(num_indices < 1)) { + return Status::Invalid("Invalid number of indices: ", num_indices); 
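// Refill pattern used by visit_valid: dictionary indices come out of the
// RLE/bit-packed run in batches into a small stack buffer and are consumed one
// per non-null slot (sketch, using this function's locals):
//
//   if (pos_indices == num_indices) {                            // buffer drained
//     num_indices = idx_decoder_.GetBatch(indices, batch_size);  // refill
//     pos_indices = 0;
//   }
//   const int32_t index = indices[pos_indices++];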
+ } + pos_indices = 0; + } + const auto index = indices[pos_indices++]; + RETURN_NOT_OK(IndexInBounds(index)); + const auto& val = dict_values[index]; + if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { + RETURN_NOT_OK(helper.PushChunk()); + } + RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); + ++values_decoded; + return Status::OK(); + }; + + auto visit_null = [&]() -> Status { + RETURN_NOT_OK(helper.AppendNull()); + return Status::OK(); + }; + + ::arrow::internal::BitBlockCounter bit_blocks(valid_bits, valid_bits_offset, + num_values); + int64_t position = 0; + while (position < num_values) { + const auto block = bit_blocks.NextWord(); + if (block.AllSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + ARROW_RETURN_NOT_OK(visit_valid(position)); + } + } else if (block.NoneSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + ARROW_RETURN_NOT_OK(visit_null()); + } + } else { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (bit_util::GetBit(valid_bits, valid_bits_offset + position)) { + ARROW_RETURN_NOT_OK(visit_valid(position)); + } else { + ARROW_RETURN_NOT_OK(visit_null()); + } + } + } + } + + *out_num_values = values_decoded; + return Status::OK(); + } + + Status DecodeArrowDenseNonNull(int num_values, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + constexpr int32_t kBufferSize = 2048; + int32_t indices[kBufferSize]; + int values_decoded = 0; + + ArrowLargeBinaryHelper helper(out); + auto dict_values = reinterpret_cast(dictionary_->data()); + + while (values_decoded < num_values) { + int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (num_indices == 0) ParquetException::EofException(); + for (int i = 0; i < num_indices; ++i) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { + RETURN_NOT_OK(helper.PushChunk()); + } + RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); + } + values_decoded += num_indices; + } + *out_num_values = values_decoded; + return Status::OK(); + } + + template + Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_num_values) { + constexpr int32_t kBufferSize = 1024; + int32_t indices[kBufferSize]; + + RETURN_NOT_OK(builder->Reserve(num_values)); + ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + + auto dict_values = reinterpret_cast(dictionary_->data()); + + int values_decoded = 0; + int num_appended = 0; + while (num_appended < num_values) { + bool is_valid = bit_reader.IsSet(); + bit_reader.Next(); + + if (is_valid) { + int32_t batch_size = + std::min(kBufferSize, num_values - num_appended - null_count); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + + int i = 0; + while (true) { + // Consume all indices + if (is_valid) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + RETURN_NOT_OK(builder->Append(val.ptr, val.len)); + ++i; + ++values_decoded; + } else { + RETURN_NOT_OK(builder->AppendNull()); + --null_count; + } + ++num_appended; + if (i == num_indices) { + // Do not advance the bit_reader if we have fulfilled the decode + // request + break; + } + is_valid = bit_reader.IsSet(); + bit_reader.Next(); + } + } else { + RETURN_NOT_OK(builder->AppendNull()); + --null_count; + 
++num_appended; + } + } + *out_num_values = values_decoded; + return Status::OK(); + } + + template + Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) { + constexpr int32_t kBufferSize = 2048; + int32_t indices[kBufferSize]; + + RETURN_NOT_OK(builder->Reserve(num_values)); + + auto dict_values = reinterpret_cast(dictionary_->data()); + + int values_decoded = 0; + while (values_decoded < num_values) { + int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (num_indices == 0) ParquetException::EofException(); + for (int i = 0; i < num_indices; ++i) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + RETURN_NOT_OK(builder->Append(val.ptr, val.len)); + } + values_decoded += num_indices; + } + *out_num_values = values_decoded; + return Status::OK(); + } +}; + // ---------------------------------------------------------------------- // DeltaBitPackEncoder @@ -3439,6 +3876,8 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin return std::make_unique>(descr); case Type::BYTE_ARRAY: return std::make_unique(descr); + case Type::LARGE_BYTE_ARRAY: + return std::make_unique(descr); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique(descr); default: @@ -3504,6 +3943,8 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, return std::make_unique>(descr, pool); case Type::BYTE_ARRAY: return std::make_unique(descr, pool); + case Type::LARGE_BYTE_ARRAY: + return std::make_unique(descr, pool); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, pool); default: diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index ab80284e6f83b..30345852912e8 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -62,6 +62,7 @@ using Int96Encoder = TypedEncoder; using FloatEncoder = TypedEncoder; using DoubleEncoder = TypedEncoder; using ByteArrayEncoder = TypedEncoder; +using LargeByteArrayEncoder = TypedEncoder; using FLBAEncoder = TypedEncoder; template @@ -74,6 +75,7 @@ using Int96Decoder = TypedDecoder; using FloatDecoder = TypedDecoder; using DoubleDecoder = TypedDecoder; using ByteArrayDecoder = TypedDecoder; +using LargeByteArrayDecoder = TypedDecoder; class FLBADecoder; template @@ -143,6 +145,21 @@ struct EncodingTraits { using Encoder = ByteArrayEncoder; using Decoder = ByteArrayDecoder; + /// \brief Internal helper class for decoding BYTE_ARRAY data where we can + /// overflow the capacity of a single arrow::BinaryArray + struct Accumulator { + std::unique_ptr<::arrow::BinaryBuilder> builder; + std::vector> chunks; + }; + using ArrowType = ::arrow::BinaryType; + using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; +}; + +template <> +struct EncodingTraits { + using Encoder = LargeByteArrayEncoder; + using Decoder = LargeByteArrayDecoder; + /// \brief Internal helper class for decoding BYTE_ARRAY data where we can /// overflow the capacity of a single arrow::BinaryArray struct Accumulator { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 0a9864de6266a..4da43ffe91b23 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -116,6 +116,12 @@ class PARQUET_EXPORT ReaderProperties { page_checksum_verification_ = check_crc; } + bool use_binary_large_variants() const { return use_binary_large_variants_; } + + void set_use_binary_large_variants(bool use_binary_large_variants) { + 
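// A usage sketch for this opt-in flag (hypothetical call site; at this point
// in the series the switch still lives on parquet::ReaderProperties):
//
//   parquet::ReaderProperties props = parquet::default_reader_properties();
//   props.set_use_binary_large_variants(true);
//   // Readers built from these properties decode BYTE_ARRAY columns into
//   // LargeBinary/LargeString (64-bit offsets) instead of Binary/String.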
use_binary_large_variants_ = use_binary_large_variants; + } + private: MemoryPool* pool_; int64_t buffer_size_ = kDefaultBufferSize; @@ -124,6 +130,7 @@ class PARQUET_EXPORT ReaderProperties { bool buffered_stream_enabled_ = false; bool page_checksum_verification_ = false; std::shared_ptr file_decryption_properties_; + bool use_binary_large_variants_ = false; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -851,6 +858,14 @@ class PARQUET_EXPORT ArrowReaderProperties { return coerce_int96_timestamp_unit_; } + void set_use_binary_large_variants(bool use_binary_large_variants) { + use_binary_large_variants_ = use_binary_large_variants; + } + + bool use_binary_large_variants() const { + return use_binary_large_variants_; + } + private: bool use_threads_; std::unordered_set read_dict_indices_; @@ -859,6 +874,7 @@ class PARQUET_EXPORT ArrowReaderProperties { ::arrow::io::IOContext io_context_; ::arrow::io::CacheOptions cache_options_; ::arrow::TimeUnit::type coerce_int96_timestamp_unit_; + bool use_binary_large_variants_; }; /// EXPERIMENTAL: Constructs the default ArrowReaderProperties diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index d4d6a73f147fc..3edabf6d311c2 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -64,6 +64,9 @@ struct Type { DOUBLE = 5, BYTE_ARRAY = 6, FIXED_LEN_BYTE_ARRAY = 7, + + // workaround + LARGE_BYTE_ARRAY = 8, // Should always be last element. UNDEFINED = 8 }; @@ -588,6 +591,26 @@ inline bool operator!=(const ByteArray& left, const ByteArray& right) { return !(left == right); } +struct LargeByteArray { + LargeByteArray() : len(0), ptr(NULLPTR) {} + LargeByteArray(uint64_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} + + LargeByteArray(::std::string_view view) // NOLINT implicit conversion + : LargeByteArray(view.size(), + reinterpret_cast(view.data())) {} + uint64_t len; + const uint8_t* ptr; +}; + +inline bool operator==(const LargeByteArray& left, const LargeByteArray& right) { + return left.len == right.len && + (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0); +} + +inline bool operator!=(const LargeByteArray& left, const LargeByteArray& right) { + return !(left == right); +} + struct FixedLenByteArray { FixedLenByteArray() : ptr(NULLPTR) {} explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {} @@ -740,6 +763,14 @@ struct type_traits { static constexpr const char* printf_code = "s"; }; +template<> +struct type_traits { + using value_type = LargeByteArray; + + static constexpr int value_byte_size = sizeof(LargeByteArray); + static constexpr const char* printf_code = "ls"; +}; + template <> struct type_traits { using value_type = FixedLenByteArray; @@ -761,6 +792,7 @@ using Int96Type = PhysicalType; using FloatType = PhysicalType; using DoubleType = PhysicalType; using ByteArrayType = PhysicalType; +using LargeByteArrayType = PhysicalType; using FLBAType = PhysicalType; template From 34917d5608330cd0961e32f8997f2e513e03dae2 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 29 May 2023 15:22:21 -0300 Subject: [PATCH 04/69] still not working --- cpp/src/parquet/arrow/reader.cc | 2 +- cpp/src/parquet/column_reader.cc | 4 ++-- cpp/src/parquet/encoding.cc | 11 +++++------ cpp/src/parquet/types.h | 4 ++-- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 09b3c8c5f8fdd..8d636810e7c69 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -462,7 +462,7 @@ class 
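// LargeByteArray, added in types.h above, mirrors parquet::ByteArray with a
// widened length field; equality is length plus memcmp (sketch):
//
//   LargeByteArray a(3, reinterpret_cast<const uint8_t*>("abc"));
//   LargeByteArray b(std::string_view("abc"));   // implicit conversion ctor
//   assert(a == b);  // lengths match and memcmp over 3 bytes returns 0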
LeafReader : public ColumnReaderImpl { input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make( - descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, true /* large variants*/); + descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, true /* use_binary_string_large_variants */); NextRowGroup(); } diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 360c506dcff4a..e50f2262af32e 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2142,7 +2142,7 @@ class LargeByteArrayChunkedRecordReader : public TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { - ARROW_DCHECK_EQ(descr_->physical_type(), Type::LARGE_BYTE_ARRAY); + ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); } @@ -2362,7 +2362,7 @@ std::shared_ptr MakeByteArrayRecordReader(const ColumnDescriptor* read_dense_for_nullable); } else { if (use_binary_string_large_variants) { - return std::make_shared( + return std::make_shared( descr, leaf_info, pool, read_dense_for_nullable); } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 8e350f39f1393..1aaca5b23f567 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1848,25 +1848,24 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictio template <> void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { - [[maybe_unused]] auto z = dictionary->values_left(); DecodeDict(dictionary); auto dict_values = reinterpret_cast(dictionary_->mutable_data()); - uint64_t total_size = 0; + uint32_t total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { total_size += dict_values[i].len; } PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); PARQUET_THROW_NOT_OK( - byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int64_t), + byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t), /*shrink_to_fit=*/false)); - int64_t offset = 0; + int32_t offset = 0; uint8_t* bytes_data = byte_array_data_->mutable_data(); - int64_t* bytes_offsets = - reinterpret_cast(byte_array_offsets_->mutable_data()); + int32_t* bytes_offsets = + reinterpret_cast(byte_array_offsets_->mutable_data()); for (int i = 0; i < dictionary_length_; ++i) { memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); bytes_offsets[i] = offset; diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 3edabf6d311c2..14091fa7dd156 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -593,12 +593,12 @@ inline bool operator!=(const ByteArray& left, const ByteArray& right) { struct LargeByteArray { LargeByteArray() : len(0), ptr(NULLPTR) {} - LargeByteArray(uint64_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} + LargeByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} LargeByteArray(::std::string_view view) // NOLINT implicit conversion : LargeByteArray(view.size(), reinterpret_cast(view.data())) {} - uint64_t len; + uint32_t len; const uint8_t* ptr; }; From 835b07dc688554c62efb0c604be421d7a2174943 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 29 May 2023 17:27:51 -0300 Subject: [PATCH 05/69] able to read the file again --- cpp/src/arrow/type.h | 2 +- cpp/src/parquet/arrow/reader.cc | 2 +- cpp/src/parquet/encoding.cc | 38 +++++++++++++++++++++++++++++++-- 3 files changed, 38 insertions(+), 4 deletions(-) 
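This commit adjusts kLargeBinaryMemoryLimit and adds a PLAIN decode path for
LargeByteArray. A sketch of the length-prefixed read it introduces (names taken
from the ReadLargeByteArray helper added in encoding.cc below; illustrative
only):

    LargeByteArray value;
    int64_t consumed = ReadLargeByteArray(data, data_size, &value);
    data += consumed;        // 4-byte length prefix plus value.len payload
    data_size -= consumed;   // EOF and negative-length checks happen inside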
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 820312e7a0c77..57be85114187d 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -679,7 +679,7 @@ class ARROW_EXPORT BaseBinaryType : public DataType { constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; -constexpr int64_t kLargeBinaryMemoryLimit = std::numeric_limits::max() - 1; // 2^35 +constexpr int64_t kLargeBinaryMemoryLimit = std::numeric_limits::max() - 1; // 2^35 /// \addtogroup binary-datatypes /// diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 8d636810e7c69..8e68e1701b8d3 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -462,7 +462,7 @@ class LeafReader : public ColumnReaderImpl { input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make( - descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, true /* use_binary_string_large_variants */); + descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, true /* use_binary_large_variants */); NextRowGroup(); } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 1aaca5b23f567..3295df90d5933 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1126,6 +1127,39 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size, int nu return bytes_decoded; } +static inline int64_t ReadLargeByteArray(const uint8_t* data, int64_t data_size, + LargeByteArray* out) { + if (ARROW_PREDICT_FALSE(data_size < 4)) { + ParquetException::EofException(); + } + const int32_t len = SafeLoadAs(data); + if (len < 0) { + throw ParquetException("Invalid BYTE_ARRAY value"); + } + const int64_t consumed_length = static_cast(len) + 4; + if (ARROW_PREDICT_FALSE(data_size < consumed_length)) { + ParquetException::EofException(); + } + *out = LargeByteArray{static_cast(len), data + 4}; + return consumed_length; +} + +template <> +inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values, + int type_length, LargeByteArray* out) { + int bytes_decoded = 0; + for (int i = 0; i < num_values; ++i) { + const auto increment = ReadLargeByteArray(data, data_size, out + i); + if (ARROW_PREDICT_FALSE(increment > INT_MAX - bytes_decoded)) { + throw ParquetException("BYTE_ARRAY chunk too large"); + } + data += increment; + data_size -= increment; + bytes_decoded += static_cast(increment); + } + return bytes_decoded; +} + // Template specialization for FIXED_LEN_BYTE_ARRAY. The written values do not // own their own data. 
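// Note the guard in DecodePlain<LargeByteArray> above: bytes_decoded is an
// int, so each per-value increment is checked before accumulating (sketch of
// the invariant, same names as the loop):
//
//   if (ARROW_PREDICT_FALSE(increment > INT_MAX - bytes_decoded)) {
//     throw ParquetException("BYTE_ARRAY chunk too large");
//   }
//
// rejecting pages whose decoded size would overflow the return type.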
template <> @@ -1850,9 +1884,9 @@ template <> void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { DecodeDict(dictionary); - auto dict_values = reinterpret_cast(dictionary_->mutable_data()); + auto dict_values = reinterpret_cast(dictionary_->mutable_data()); - uint32_t total_size = 0; + int total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { total_size += dict_values[i].len; } From 50427c6af4b6099ae864a5216fe94fb97c5e3fbe Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 09:49:51 -0300 Subject: [PATCH 06/69] move use_binary_large_variants to arrowreaderproperties --- cpp/examples/parquet/parquet_arrow/reader_writer.cc | 10 ++++++---- cpp/src/parquet/arrow/reader.cc | 3 ++- cpp/src/parquet/arrow/reader_internal.h | 1 + 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index 8357f380d106e..eaf94c6ad7e96 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -71,11 +71,13 @@ void read_whole_file(const std::string & filename) { parquet::arrow::FileReaderBuilder builder; - parquet::ReaderProperties props = parquet::default_reader_properties(); + parquet::ArrowReaderProperties properties; -// props.set_use_binary_large_variants(true); + properties.set_use_binary_large_variants(true); - PARQUET_THROW_NOT_OK(builder.Open(infile, props)); + builder.properties(properties); + + PARQUET_THROW_NOT_OK(builder.Open(infile)); PARQUET_THROW_NOT_OK(builder.Build(&reader)); @@ -145,7 +147,7 @@ void read_single_column_chunk() { int main(int argc, char** argv) { // std::shared_ptr table = generate_table(); // write_parquet_file(*table); - read_whole_file("minimal_repro.parquet"); + read_whole_file("chunked_jira.parquet"); // read_single_rowgroup(); // read_single_column("minimal_repro.parquet"); // read_single_column_chunk(); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 8e68e1701b8d3..b95be9cdd1415 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -219,6 +219,7 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; + ctx->use_binary_large_variants = reader_properties_.use_binary_large_variants(); return GetReader(manifest_.schema_fields[i], ctx, out); } @@ -462,7 +463,7 @@ class LeafReader : public ColumnReaderImpl { input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make( - descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, true /* use_binary_large_variants */); + descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, ctx_->use_binary_large_variants); NextRowGroup(); } diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index cf9dbb86577b5..c5ee54b7c03d4 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -109,6 +109,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; + bool use_binary_large_variants = false; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { From df65ce7087860e047552bf3e9e73cad39e010679 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 10:14:54 -0300 Subject: [PATCH 07/69] cleanup a 
bit --- cpp/examples/arrow/parquet_read_write.cc | 2 +- cpp/examples/parquet/parquet_arrow/reader_writer.cc | 9 ++++----- cpp/src/arrow/memory_pool.cc | 3 --- cpp/src/arrow/type.h | 3 +-- cpp/src/parquet/column_scanner.h | 8 ++++++++ cpp/src/parquet/encoding.cc | 1 - cpp/src/parquet/types.h | 4 ++++ 7 files changed, 18 insertions(+), 12 deletions(-) diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index 20fe2c20b291a..3b8b4c2212b75 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -120,7 +120,7 @@ arrow::Status WriteFullFile(std::string path_to_file) { ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), outfile, - /*chunk_size=*/1024*1024*1024, props, arrow_props)); + /*chunk_size=*/3, props, arrow_props)); return arrow::Status::OK(); } diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index eaf94c6ad7e96..343914c367faa 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -56,7 +56,7 @@ void write_parquet_file(const arrow::Table& table) { // the parquet file. Normally you would choose this to be rather large but // for the example, we use a small value to have multiple RowGroups. PARQUET_THROW_NOT_OK( - parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 1024*1024 * 1024)); + parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3)); } // #2: Fully read in the file @@ -108,18 +108,17 @@ void read_single_rowgroup() { } // #4: Read only a single column of the whole parquet file -void read_single_column(const std::string & filename) { - std::cout << "Reading first column of " << filename << std::endl; +void read_single_column() { std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, - arrow::io::ReadableFile::Open(filename, + arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", arrow::default_memory_pool())); std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); std::shared_ptr array; - PARQUET_THROW_NOT_OK(reader->ReadColumn(5, &array)); + PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; } diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index dd14953f7ff47..843329c17bc28 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -888,9 +888,6 @@ class PoolBuffer final : public ResizableBuffer { capacity_ = new_capacity; } } else { - if (new_size > static_cast(pow(2, 59))) { - assert(false); - } RETURN_NOT_OK(Reserve(new_size)); } size_ = new_size; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 57be85114187d..64d2893a9725c 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -679,7 +678,7 @@ class ARROW_EXPORT BaseBinaryType : public DataType { constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; -constexpr int64_t kLargeBinaryMemoryLimit = std::numeric_limits::max() - 1; // 2^35 +constexpr int64_t kLargeBinaryMemoryLimit = std::numeric_limits::max() - 1; /// \addtogroup binary-datatypes /// diff --git a/cpp/src/parquet/column_scanner.h b/cpp/src/parquet/column_scanner.h index d53435f03cd32..7bea4ca24d6db 100644 --- 
a/cpp/src/parquet/column_scanner.h +++ b/cpp/src/parquet/column_scanner.h @@ -225,6 +225,14 @@ inline void TypedScanner::FormatValue(void* val, char* buffer, in snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); } +template <> +inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, + int width) { + std::string fmt = format_fwf(width); + std::string result = LargeByteArrayToString(*reinterpret_cast(val)); + snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); +} + template <> inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, int width) { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 3295df90d5933..f124db0736875 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 14091fa7dd156..226d0c5730a7b 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -645,6 +645,10 @@ static inline std::string ByteArrayToString(const ByteArray& a) { return std::string(reinterpret_cast(a.ptr), a.len); } +static inline std::string LargeByteArrayToString(const LargeByteArray& a) { + return std::string(reinterpret_cast(a.ptr), a.len); +} + static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) { std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); } From e826b8e82de3ea361599f1e9b600861fc503d227 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 11:08:06 -0300 Subject: [PATCH 08/69] back fromByteArray string & binary with setting --- .../parquet/parquet_arrow/reader_writer.cc | 12 ++++++------ cpp/src/parquet/arrow/schema.cc | 2 +- cpp/src/parquet/arrow/schema_internal.cc | 15 ++++++++------- cpp/src/parquet/arrow/schema_internal.h | 11 +++++++---- 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index 343914c367faa..6468edd67534d 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -84,6 +84,10 @@ void read_whole_file(const std::string & filename) { // PARQUET_THROW_NOT_OK( // parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr<::arrow::Schema> schema; + + [[maybe_unused]] auto metadata = reader->GetSchema(&schema); + std::shared_ptr table; PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() @@ -144,10 +148,6 @@ void read_single_column_chunk() { } int main(int argc, char** argv) { -// std::shared_ptr table = generate_table(); -// write_parquet_file(*table); - read_whole_file("chunked_jira.parquet"); -// read_single_rowgroup(); -// read_single_column("minimal_repro.parquet"); -// read_single_column_chunk(); +// read_whole_file("chunked_jira.parquet"); + read_whole_file("minimal_repro.parquet"); } diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index c5d5e0743a7f1..4920bad21f0df 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -473,7 +473,7 @@ ::arrow::Result> GetTypeForNode( SchemaTreeContext* ctx) { ASSIGN_OR_RAISE( std::shared_ptr storage_type, - GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit())); + GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit(), 
ctx->properties.use_binary_large_variants())); if (ctx->properties.read_dictionary(column_index) && IsDictionaryReadSupported(*storage_type)) { return ::arrow::dictionary(::arrow::int32(), storage_type); diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index dbff14d93b84e..b9f6bea7e0ecb 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -110,17 +110,17 @@ Result> MakeArrowTimestamp(const LogicalType& logical } } -Result> FromByteArray(const LogicalType& logical_type) { +Result> FromByteArray(const LogicalType& logical_type, bool use_binary_large_variant) { switch (logical_type.type()) { case LogicalType::Type::STRING: - return ::arrow::large_utf8(); + return use_binary_large_variant ? ::arrow::large_utf8() : ::arrow::utf8(); case LogicalType::Type::DECIMAL: return MakeArrowDecimal(logical_type); case LogicalType::Type::NONE: case LogicalType::Type::ENUM: case LogicalType::Type::JSON: case LogicalType::Type::BSON: - return ::arrow::binary(); + return use_binary_large_variant ? ::arrow::large_binary() : ::arrow::binary(); default: return Status::NotImplemented("Unhandled logical logical_type ", logical_type.ToString(), " for binary array"); @@ -181,7 +181,7 @@ Result> FromInt64(const LogicalType& logical_type) { Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, - const ::arrow::TimeUnit::type int96_arrow_time_unit) { + const ::arrow::TimeUnit::type int96_arrow_time_unit, bool use_binary_large_variant) { if (logical_type.is_invalid() || logical_type.is_null()) { return ::arrow::null(); } @@ -200,7 +200,7 @@ Result> GetArrowType( case ParquetType::DOUBLE: return ::arrow::float64(); case ParquetType::BYTE_ARRAY: - return FromByteArray(logical_type); + return FromByteArray(logical_type, use_binary_large_variant); case ParquetType::FIXED_LEN_BYTE_ARRAY: return FromFLBA(logical_type, type_length); default: { @@ -213,9 +213,10 @@ Result> GetArrowType( Result> GetArrowType( const schema::PrimitiveNode& primitive, - const ::arrow::TimeUnit::type int96_arrow_time_unit) { + const ::arrow::TimeUnit::type int96_arrow_time_unit, + bool use_binary_large_variant) { return GetArrowType(primitive.physical_type(), *primitive.logical_type(), - primitive.type_length(), int96_arrow_time_unit); + primitive.type_length(), int96_arrow_time_unit, use_binary_large_variant); } } // namespace arrow diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index fb837c3ee6cab..e17f2d2d07c5b 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -29,7 +29,7 @@ namespace arrow { using ::arrow::Result; -Result> FromByteArray(const LogicalType& logical_type); +Result> FromByteArray(const LogicalType& logical_type, bool use_binary_large_variant = false); Result> FromFLBA(const LogicalType& logical_type, int32_t physical_length); Result> FromInt32(const LogicalType& logical_type); @@ -37,15 +37,18 @@ Result> FromInt64(const LogicalType& logical_ Result> GetArrowType(Type::type physical_type, const LogicalType& logical_type, - int type_length); + int type_length, + bool use_binary_large_variant = false); Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, - ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); + ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO, + bool use_binary_large_variant = false); Result> 
GetArrowType( const schema::PrimitiveNode& primitive, - ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); + ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO, + bool use_binary_large_variant = false); } // namespace arrow } // namespace parquet From 764ef98a0cf4fb94416e1fb732e60219a13c84af Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 13:40:39 -0300 Subject: [PATCH 09/69] some more adjustments --- cpp/src/parquet/arrow/schema_internal.cc | 25 ++++++++++++++++++++---- cpp/src/parquet/arrow/schema_internal.h | 4 +++- cpp/src/parquet/column_reader.h | 1 + cpp/src/parquet/column_writer.h | 1 + cpp/src/parquet/metadata.cc | 2 ++ cpp/src/parquet/page_index.cc | 4 ++++ cpp/src/parquet/stream_reader.cc | 3 +++ cpp/src/parquet/stream_writer.cc | 4 ++++ cpp/src/parquet/types.cc | 1 + cpp/src/parquet/types.h | 5 +++-- 10 files changed, 43 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index b9f6bea7e0ecb..a256ec4a6d7f9 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -110,17 +110,34 @@ Result> MakeArrowTimestamp(const LogicalType& logical } } -Result> FromByteArray(const LogicalType& logical_type, bool use_binary_large_variant) { +Result> FromByteArray(const LogicalType& logical_type) { switch (logical_type.type()) { case LogicalType::Type::STRING: - return use_binary_large_variant ? ::arrow::large_utf8() : ::arrow::utf8(); + return ::arrow::utf8(); case LogicalType::Type::DECIMAL: return MakeArrowDecimal(logical_type); case LogicalType::Type::NONE: case LogicalType::Type::ENUM: case LogicalType::Type::JSON: case LogicalType::Type::BSON: - return use_binary_large_variant ? ::arrow::large_binary() : ::arrow::binary(); + return ::arrow::binary(); + default: + return Status::NotImplemented("Unhandled logical logical_type ", + logical_type.ToString(), " for binary array"); + } +} + +Result> FromLargeByteArray(const LogicalType& logical_type) { + switch (logical_type.type()) { + case LogicalType::Type::STRING: + return ::arrow::large_utf8(); + case LogicalType::Type::DECIMAL: + return MakeArrowDecimal(logical_type); + case LogicalType::Type::NONE: + case LogicalType::Type::ENUM: + case LogicalType::Type::JSON: + case LogicalType::Type::BSON: + return ::arrow::large_binary(); default: return Status::NotImplemented("Unhandled logical logical_type ", logical_type.ToString(), " for binary array"); @@ -200,7 +217,7 @@ Result> GetArrowType( case ParquetType::DOUBLE: return ::arrow::float64(); case ParquetType::BYTE_ARRAY: - return FromByteArray(logical_type, use_binary_large_variant); + return use_binary_large_variant ? 
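// The mapping selected by this ternary (sketch):
//
//   logical type           large variant off       large variant on
//   STRING                 ::arrow::utf8()         ::arrow::large_utf8()
//   NONE/ENUM/JSON/BSON    ::arrow::binary()       ::arrow::large_binary()
//   DECIMAL                MakeArrowDecimal(...)   MakeArrowDecimal(...)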
FromLargeByteArray(logical_type) : FromByteArray(logical_type); case ParquetType::FIXED_LEN_BYTE_ARRAY: return FromFLBA(logical_type, type_length); default: { diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index e17f2d2d07c5b..d27440ea22301 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -29,7 +29,9 @@ namespace arrow { using ::arrow::Result; -Result> FromByteArray(const LogicalType& logical_type, bool use_binary_large_variant = false); +Result> FromByteArray(const LogicalType& logical_type); +Result> FromLargeByteArray(const LogicalType& logical_type); + Result> FromFLBA(const LogicalType& logical_type, int32_t physical_length); Result> FromInt32(const LogicalType& logical_type); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 94f42e7db6563..2c6dfea9d39a1 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -492,6 +492,7 @@ using Int96Reader = TypedColumnReader; using FloatReader = TypedColumnReader; using DoubleReader = TypedColumnReader; using ByteArrayReader = TypedColumnReader; +using LargeByteArrayReader = TypedColumnReader; using FixedLenByteArrayReader = TypedColumnReader; } // namespace parquet diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 792b108ac8835..545ecbb6732f8 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -233,6 +233,7 @@ using Int96Writer = TypedColumnWriter; using FloatWriter = TypedColumnWriter; using DoubleWriter = TypedColumnWriter; using ByteArrayWriter = TypedColumnWriter; +using LargeByteArrayWriter = TypedColumnWriter; using FixedLenByteArrayWriter = TypedColumnWriter; namespace internal { diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 0bbd96580774a..055e679a9b685 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -123,6 +123,8 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d return MakeTypedColumnStats(meta_data, descr); case Type::BYTE_ARRAY: return MakeTypedColumnStats(meta_data, descr); + case Type::LARGE_BYTE_ARRAY: + return MakeTypedColumnStats(meta_data, descr); case Type::FIXED_LEN_BYTE_ARRAY: return MakeTypedColumnStats(meta_data, descr); case Type::UNDEFINED: diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index d29cc33eb5afd..969db469bbeb5 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -853,6 +853,8 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, return std::make_unique>(descr, column_index); case Type::BYTE_ARRAY: return std::make_unique>(descr, column_index); + case Type::LARGE_BYTE_ARRAY: + return std::make_unique>(descr, column_index); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, column_index); case Type::UNDEFINED: @@ -897,6 +899,8 @@ std::unique_ptr ColumnIndexBuilder::Make( return std::make_unique>(descr); case Type::BYTE_ARRAY: return std::make_unique>(descr); + case Type::LARGE_BYTE_ARRAY: + return std::make_unique>(descr); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr); case Type::UNDEFINED: diff --git a/cpp/src/parquet/stream_reader.cc b/cpp/src/parquet/stream_reader.cc index 0fecb1bf24615..66bcf5ca97560 100644 --- a/cpp/src/parquet/stream_reader.cc +++ b/cpp/src/parquet/stream_reader.cc @@ -488,6 +488,9 @@ void StreamReader::SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_sk case 
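// Pattern used throughout this commit: every switch over Type::type gains a
// LARGE_BYTE_ARRAY arm shaped exactly like the BYTE_ARRAY one, e.g. in
// metadata.cc (sketch):
//
//   case Type::LARGE_BYTE_ARRAY:
//     return MakeTypedColumnStats<LargeByteArrayType>(meta_data, descr);
//
// so the synthetic physical type is routed wherever a real one would be.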
Type::BYTE_ARRAY: num_skipped = static_cast(reader)->Skip(num_rows_to_skip); break; + case Type::LARGE_BYTE_ARRAY: + num_skipped = static_cast(reader)->Skip(num_rows_to_skip); + break; case Type::FIXED_LEN_BYTE_ARRAY: num_skipped = static_cast(reader)->Skip(num_rows_to_skip); break; diff --git a/cpp/src/parquet/stream_writer.cc b/cpp/src/parquet/stream_writer.cc index 856436d701816..d93368740f9a8 100644 --- a/cpp/src/parquet/stream_writer.cc +++ b/cpp/src/parquet/stream_writer.cc @@ -251,6 +251,10 @@ void StreamWriter::WriteNullValue(ColumnWriter* writer) { static_cast(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr); break; + case Type::LARGE_BYTE_ARRAY: + static_cast(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero, + &kRepLevelZero, nullptr); + break; case Type::FIXED_LEN_BYTE_ARRAY: static_cast(writer)->WriteBatch( kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr); diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 28f472aaf9dd8..d5d0442177934 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -260,6 +260,7 @@ SortOrder::type DefaultSortOrder(Type::type primitive) { case Type::DOUBLE: return SortOrder::SIGNED; case Type::BYTE_ARRAY: + case Type::LARGE_BYTE_ARRAY: case Type::FIXED_LEN_BYTE_ARRAY: return SortOrder::UNSIGNED; case Type::INT96: diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 226d0c5730a7b..3a16a84e9e3a0 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -65,10 +65,11 @@ struct Type { BYTE_ARRAY = 6, FIXED_LEN_BYTE_ARRAY = 7, - // workaround + // This parquet type does not actually exist (AFAIK) and is used to + // create proper type traits LARGE_BYTE_ARRAY = 8, // Should always be last element. - UNDEFINED = 8 + UNDEFINED = 9 }; }; From c6244eac25e40f64a72483be01ec038dfe2a1e26 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 13:52:40 -0300 Subject: [PATCH 10/69] revert some stuff --- .../parquet/parquet_arrow/reader_writer.cc | 39 +++++++------------ 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index 6468edd67534d..debf62736bdd0 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -60,34 +60,16 @@ void write_parquet_file(const arrow::Table& table) { } // #2: Fully read in the file -void read_whole_file(const std::string & filename) { - std::cout << "Reading " << filename << " at once" << std::endl; +void read_whole_file() { + std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl; std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, - arrow::io::ReadableFile::Open(filename, + arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", arrow::default_memory_pool())); std::unique_ptr reader; - - parquet::arrow::FileReaderBuilder builder; - - parquet::ArrowReaderProperties properties; - - properties.set_use_binary_large_variants(true); - - builder.properties(properties); - - PARQUET_THROW_NOT_OK(builder.Open(infile)); - - PARQUET_THROW_NOT_OK(builder.Build(&reader)); - -// PARQUET_THROW_NOT_OK( -// parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - - std::shared_ptr<::arrow::Schema> schema; - - [[maybe_unused]] auto metadata = reader->GetSchema(&schema); - + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); std::shared_ptr table; 
PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() @@ -113,6 +95,7 @@ void read_single_rowgroup() { // #4: Read only a single column of the whole parquet file void read_single_column() { + std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl; std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", @@ -148,6 +131,10 @@ void read_single_column_chunk() { } int main(int argc, char** argv) { -// read_whole_file("chunked_jira.parquet"); - read_whole_file("minimal_repro.parquet"); -} + std::shared_ptr table = generate_table(); + write_parquet_file(*table); + read_whole_file(); + read_single_rowgroup(); + read_single_column(); + read_single_column_chunk(); +} \ No newline at end of file From 5a4bbb0dbae93007ed135dc5377d573a7caeeed6 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 13:56:17 -0300 Subject: [PATCH 11/69] revert some stuff --- cpp/CMakePresets.json | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 40ccd64a93695..7882be57a0534 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -220,12 +220,7 @@ "features-main" ], "displayName": "Debug build with tests and more optional components", - "cacheVariables": { - "ARROW_BUILD_EXAMPLES": "ON", - "PARQUET_BUILD_EXAMPLES": "ON", - "ARROW_BUILD_TESTS": "ON", - "ARROW_BUILD_UTILITIES": "ON" - } + "cacheVariables": {} }, { "name": "ninja-debug-cuda", From 2d84e57149e49e824cc874b5c8aff7c372fd0305 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 14:01:43 -0300 Subject: [PATCH 12/69] improvement --- cpp/src/parquet/column_reader.cc | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index e50f2262af32e..01197507890b9 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2355,22 +2355,30 @@ std::shared_ptr MakeByteArrayRecordReader(const ColumnDescriptor* LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dictionary, - bool read_dense_for_nullable, - bool use_binary_string_large_variants) { + bool read_dense_for_nullable) { if (read_dictionary) { return std::make_shared(descr, leaf_info, pool, read_dense_for_nullable); } else { - if (use_binary_string_large_variants) { - return std::make_shared( - descr, leaf_info, pool, read_dense_for_nullable); - } - return std::make_shared(descr, leaf_info, pool, read_dense_for_nullable); } } +std::shared_ptr MakeLargeByteArrayRecordReader(const ColumnDescriptor* descr, + LevelInfo leaf_info, + ::arrow::MemoryPool* pool, + bool read_dictionary, + bool read_dense_for_nullable) { + if (read_dictionary) { + return std::make_shared(descr, leaf_info, pool, + read_dense_for_nullable); + } else { + return std::make_shared( + descr, leaf_info, pool, read_dense_for_nullable); + } +} + } // namespace std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, @@ -2398,8 +2406,10 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, return std::make_shared>(descr, leaf_info, pool, read_dense_for_nullable); case Type::BYTE_ARRAY: { - return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, - read_dense_for_nullable, use_binary_string_large_variants); + return use_binary_string_large_variants ? 
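// The two factories mirror each other: read_dictionary selects the dictionary
// record reader, otherwise the chunked one, each instantiated for the matching
// offset width (sketch; class names restored from earlier in this series):
//
//   if (read_dictionary) {
//     return std::make_shared<LargeByteArrayDictionaryRecordReader>(
//         descr, leaf_info, pool, read_dense_for_nullable);
//   }
//   return std::make_shared<LargeByteArrayChunkedRecordReader>(
//       descr, leaf_info, pool, read_dense_for_nullable);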
MakeLargeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary,
+                                                        read_dense_for_nullable)
+        : MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary,
+                                    read_dense_for_nullable);
   }
   case Type::FIXED_LEN_BYTE_ARRAY:
     return std::make_shared<FLBARecordReader>(descr, leaf_info, pool,

From 90f14df903ae8e416aea1d2b6c032b9c501a5fb1 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 30 May 2023 14:35:44 -0300
Subject: [PATCH 13/69] remove dictionary64

---
 cpp/src/arrow/array/builder_dict.h | 25 +------------------------
 cpp/src/parquet/column_reader.cc   |  2 +-
 cpp/src/parquet/encoding.cc        |  6 +++---
 cpp/src/parquet/encoding.h         |  4 +---
 4 files changed, 6 insertions(+), 31 deletions(-)

diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h
index 9a248dc6fe393..3adf5b843b916 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -715,29 +715,6 @@ class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder
   }
 };
 
-template <typename T>
-class Dictionary64Builder : public internal::DictionaryBuilderBase<Int64Builder, T> {
- public:
-  using BASE = internal::DictionaryBuilderBase<Int64Builder, T>;
-  using BASE::BASE;
-
-  /// \brief Append dictionary indices directly without modifying memo
-  ///
-  /// NOTE: Experimental API
-  Status AppendIndices(const int64_t* values, int64_t length,
-                       const uint8_t* valid_bytes = NULLPTR) {
-    int64_t null_count_before = this->indices_builder_.null_count();
-    ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
-    this->capacity_ = this->indices_builder_.capacity();
-    this->length_ += length;
-    this->null_count_ += this->indices_builder_.null_count() - null_count_before;
-    return Status::OK();
-  }
-};
-
 // ----------------------------------------------------------------------
 // Binary / Unicode builders
 // (compatibility aliases; those used to be derived classes with additional
@@ -747,7 +724,7 @@ using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
 using StringDictionaryBuilder = DictionaryBuilder<StringType>;
 using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
 using StringDictionary32Builder = Dictionary32Builder<StringType>;
-using BinaryDictionary64Builder = Dictionary64Builder<BinaryType>;
+using LargeBinaryDictionary32Builder = Dictionary32Builder<LargeBinaryType>;
 
 /// @}
 
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 01197507890b9..7666cbff5f104 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -2337,7 +2337,7 @@ class LargeByteArrayDictionaryRecordReader : public TypedRecordReader<LargeByte
 
  private:
   using LargeBinaryDictDecoder = DictDecoder<LargeByteArrayType>;
-  ::arrow::BinaryDictionary64Builder builder_;
+  ::arrow::LargeBinaryDictionary32Builder builder_;
   std::vector<std::shared_ptr<::arrow::Array>> result_chunks_;
 };
 
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index f124db0736875..634ce1496c75e 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -1551,7 +1551,7 @@ class PlainLargeByteArrayDecoder : public PlainDecoder<LargeByteArrayType>,
 
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
-                  ::arrow::BinaryDictionary64Builder* builder) override {
+                  ::arrow::LargeBinaryDictionary32Builder* builder) override {
     int result = 0;
     PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
                                      valid_bits_offset, builder, &result));
@@ -2101,7 +2101,7 @@ void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* bui
 
 template <>
 void DictDecoderImpl<LargeByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* builder) {
-  auto binary_builder =
checked_cast<::arrow::LargeBinaryDictionary32Builder*>(builder); // Make a BinaryArray referencing the internal dictionary data auto arr = std::make_shared<::arrow::LargeBinaryArray>( @@ -2331,7 +2331,7 @@ class DictLargeByteArrayDecoderImpl : public DictDecoderImpl int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary64Builder* builder) override { + ::arrow::LargeBinaryDictionary32Builder* builder) override { int result = 0; if (null_count == 0) { PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 30345852912e8..1218a650238d7 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -45,8 +45,6 @@ class NumericBuilder; class FixedSizeBinaryBuilder; template class Dictionary32Builder; -template -class Dictionary64Builder; } // namespace arrow @@ -167,7 +165,7 @@ struct EncodingTraits { std::vector> chunks; }; using ArrowType = ::arrow::LargeBinaryType; - using DictAccumulator = ::arrow::Dictionary64Builder<::arrow::LargeBinaryType>; + using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::LargeBinaryType>; }; template <> From b88b024c9352e1a42daad84472ef602bc639f96b Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 10:58:45 -0300 Subject: [PATCH 14/69] use 64bit on largebytearray class and initialize binary_large_variant bool to false --- cpp/src/parquet/properties.h | 3 ++- cpp/src/parquet/types.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 4da43ffe91b23..26dfcaeb320ad 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -790,7 +790,8 @@ class PARQUET_EXPORT ArrowReaderProperties { batch_size_(kArrowDefaultBatchSize), pre_buffer_(false), cache_options_(::arrow::io::CacheOptions::Defaults()), - coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {} + coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO), + use_binary_large_variants_(false) {} /// \brief Set whether to use the IO thread pool to parse columns in parallel. 
/// diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 3a16a84e9e3a0..cca6247922ce2 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -594,12 +594,12 @@ inline bool operator!=(const ByteArray& left, const ByteArray& right) { struct LargeByteArray { LargeByteArray() : len(0), ptr(NULLPTR) {} - LargeByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} + LargeByteArray(uint64_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} LargeByteArray(::std::string_view view) // NOLINT implicit conversion : LargeByteArray(view.size(), reinterpret_cast(view.data())) {} - uint32_t len; + uint64_t len; const uint8_t* ptr; }; From 0b53b05a1defbe8c5d97668775f099ff654e0f17 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 13:48:19 -0300 Subject: [PATCH 15/69] add chunked string map test --- .../parquet/arrow/arrow_reader_writer_test.cc | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index ad33ca296a283..0196f73e91a92 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3834,13 +3834,14 @@ TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) { ASSERT_EQ(expected, calculated); } -void TryReadDataFile(const std::string& path, - ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) { +void TryReadDataFileWithProperties(const std::string& path, + const ArrowReaderProperties& properties, + ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) { auto pool = ::arrow::default_memory_pool(); std::unique_ptr arrow_reader; Status s = - FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), &arrow_reader); + FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), properties, &arrow_reader); if (s.ok()) { std::shared_ptr<::arrow::Table> table; s = arrow_reader->ReadTable(&table); @@ -3851,6 +3852,11 @@ void TryReadDataFile(const std::string& path, << ", but got " << s.ToString(); } +void TryReadDataFile(const std::string& path, + ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) { + TryReadDataFileWithProperties(path, default_arrow_reader_properties(), expected_code); +} + TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) { // PARQUET-995 TryReadDataFile(test::get_data_file("alltypes_plain.parquet")); @@ -3862,6 +3868,18 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) { TryReadDataFile(path, ::arrow::StatusCode::IOError); } +TEST(TestArrowParquet, LargeByteArray) { + auto path = test::get_data_file("chunked_string_map.parquet"); + + TryReadDataFile(path, ::arrow::StatusCode::NotImplemented); + + auto reader_properties = default_arrow_reader_properties(); + + reader_properties.set_use_binary_large_variants(true); + + TryReadDataFileWithProperties(path, reader_properties); +} + TEST(TestArrowReaderAdHoc, LARGE_MEMORY_TEST(LargeStringColumn)) { // ARROW-3762 ::arrow::StringBuilder builder; From f574e2ecd81d1e1c3ae5e1fdada7b41a8a0d6b87 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 14:09:12 -0300 Subject: [PATCH 16/69] add boolean comment --- cpp/src/parquet/arrow/reader.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index b95be9cdd1415..2de3c9ab4b006 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -463,7 +463,12 @@ class LeafReader : public 
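The new LargeByteArray test above boils down to the following read path. A sketch of the intended usage; note that set_use_binary_large_variants() exists only in this patch series, not in upstream Arrow:

#include <memory>
#include <string>
#include "arrow/table.h"
#include "parquet/arrow/reader.h"
#include "parquet/file_reader.h"

std::shared_ptr<::arrow::Table> ReadWithLargeVariants(const std::string& path) {
  auto properties = parquet::default_arrow_reader_properties();
  properties.set_use_binary_large_variants(true);  // opt in to LargeBinary output

  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(parquet::arrow::FileReader::Make(
      ::arrow::default_memory_pool(),
      parquet::ParquetFileReader::OpenFile(path, /*memory_map=*/false),
      properties, &reader));

  std::shared_ptr<::arrow::Table> table;
  // Per the test, reading chunked_string_map.parquet without the flag is
  // expected to fail with NotImplemented; with it, the read succeeds.
  PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
  return table;
}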
ColumnReaderImpl { input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make( - descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, ctx_->use_binary_large_variants); + descr_, + leaf_info, + ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, + /*read_dense_for_nullable*/ false, + ctx_->use_binary_large_variants + ); NextRowGroup(); } From 295e062a73ad6a0c6f6a5683f8b71525a0553ae9 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 14:53:53 -0300 Subject: [PATCH 17/69] Make ChunkedRecordReader generic by using templates --- cpp/src/parquet/column_reader.cc | 87 ++++++++++++++------------------ 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 7666cbff5f104..d936cee827ef9 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2093,57 +2093,41 @@ class FLBARecordReader : public TypedRecordReader, std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_; }; -class ByteArrayChunkedRecordReader : public TypedRecordReader, - virtual public BinaryRecordReader { - public: - ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool, bool read_dense_for_nullable) - : TypedRecordReader(descr, leaf_info, pool, - read_dense_for_nullable) { - ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); - accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool); - } - - ::arrow::ArrayVector GetBuilderChunks() override { - ::arrow::ArrayVector result = accumulator_.chunks; - if (result.size() == 0 || accumulator_.builder->length() > 0) { - std::shared_ptr<::arrow::Array> last_chunk; - PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk)); - result.push_back(std::move(last_chunk)); - } - accumulator_.chunks = {}; - return result; - } - - void ReadValuesDense(int64_t values_to_read) override { - int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull( - static_cast(values_to_read), &accumulator_); - CheckNumberDecoded(num_decoded, values_to_read); - ResetValues(); - } - - void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { - int64_t num_decoded = this->current_decoder_->DecodeArrow( - static_cast(values_to_read), static_cast(null_count), - valid_bits_->mutable_data(), values_written_, &accumulator_); - CheckNumberDecoded(num_decoded, values_to_read - null_count); - ResetValues(); - } - - private: - // Helper data structure for accumulating builder chunks - typename EncodingTraits::Accumulator accumulator_; +// Below concept could be used to simplify type assertion, but it seems like c++20 is not +// available +//template +//concept ByteArrayTypeConcept = std::is_same::value || +// std::is_same::value; + +template +struct IsByteArrayType : std::false_type {}; + +template<> +struct IsByteArrayType : std::true_type {}; + +template<> +struct IsByteArrayType : std::true_type {}; + +template +struct ByteArrayBuilderTypeTrait { + using BuilderType = typename std::conditional::value, + ::arrow::LargeBinaryBuilder, + ::arrow::BinaryBuilder>::type; }; -class LargeByteArrayChunkedRecordReader : public TypedRecordReader, - virtual public LargeBinaryRecordReader { +template +class ChunkedRecordReader : public TypedRecordReader, + virtual public BinaryRecordReader { public: - LargeByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool, bool 
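The ByteArrayBuilderTypeTrait introduced above is the crux of the templated reader: the Arrow builder type is selected at compile time from the physical-type tag, so one class body serves both variants. A reduced model of the same pattern (BuilderFor is an illustrative name, not part of the patch):

#include <type_traits>
#include "arrow/array/builder_binary.h"
#include "parquet/types.h"

template <typename T>
using BuilderFor =
    typename std::conditional<std::is_same<T, parquet::LargeByteArrayType>::value,
                              ::arrow::LargeBinaryBuilder,
                              ::arrow::BinaryBuilder>::type;

static_assert(std::is_same<BuilderFor<parquet::ByteArrayType>,
                           ::arrow::BinaryBuilder>::value, "");
static_assert(std::is_same<BuilderFor<parquet::LargeByteArrayType>,
                           ::arrow::LargeBinaryBuilder>::value, "");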
read_dense_for_nullable) - : TypedRecordReader(descr, leaf_info, pool, + using BuilderType = typename ByteArrayBuilderTypeTrait::BuilderType; + + ChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, + ::arrow::MemoryPool* pool, bool read_dense_for_nullable) + : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { - ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); - accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); + static_assert(IsByteArrayType::value, "Invalid ByteArrayType"); + ARROW_DCHECK_EQ(TypedRecordReader::descr_->physical_type(), Type::BYTE_ARRAY); + accumulator_.builder = std::make_unique(pool); } ::arrow::ArrayVector GetBuilderChunks() override { @@ -2161,7 +2145,7 @@ class LargeByteArrayChunkedRecordReader : public TypedRecordReadercurrent_decoder_->DecodeArrowNonNull( static_cast(values_to_read), &accumulator_); CheckNumberDecoded(num_decoded, values_to_read); - ResetValues(); + TypedRecordReader::ResetValues(); } void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { @@ -2169,14 +2153,17 @@ class LargeByteArrayChunkedRecordReader : public TypedRecordReader(values_to_read), static_cast(null_count), valid_bits_->mutable_data(), values_written_, &accumulator_); CheckNumberDecoded(num_decoded, values_to_read - null_count); - ResetValues(); + TypedRecordReader::ResetValues(); } private: // Helper data structure for accumulating builder chunks - typename EncodingTraits::Accumulator accumulator_; + typename EncodingTraits::Accumulator accumulator_; }; +using ByteArrayChunkedRecordReader = ChunkedRecordReader; +using LargeByteArrayChunkedRecordReader = ChunkedRecordReader; + class ByteArrayDictionaryRecordReader : public TypedRecordReader, virtual public DictionaryRecordReader { public: From 25d7815996b48cbc388250e823a6683f5dc1d851 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 16:26:17 -0300 Subject: [PATCH 18/69] Make ByteArrayDictionaryReader generic with the use of templates --- cpp/src/parquet/column_reader.cc | 105 +++++-------------------------- 1 file changed, 14 insertions(+), 91 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index d936cee827ef9..10a63ede4fd59 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2164,12 +2164,14 @@ class ChunkedRecordReader : public TypedRecordReader, using ByteArrayChunkedRecordReader = ChunkedRecordReader; using LargeByteArrayChunkedRecordReader = ChunkedRecordReader; -class ByteArrayDictionaryRecordReader : public TypedRecordReader, - virtual public DictionaryRecordReader { + +template +class DictionaryRecordReaderImpl : public TypedRecordReader, + virtual public DictionaryRecordReader { public: - ByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, + DictionaryRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) - : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), + : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), builder_(pool) { this->read_dictionary_ = true; } @@ -2206,7 +2208,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, void ReadValuesDense(int64_t values_to_read) override { int64_t num_decoded = 0; - if (current_encoding_ == Encoding::RLE_DICTIONARY) { + if (TypedRecordReader::current_encoding_ == Encoding::RLE_DICTIONARY) { MaybeWriteNewDictionary(); auto decoder = 
dynamic_cast(this->current_decoder_); num_decoded = decoder->DecodeIndices(static_cast(values_to_read), &builder_); @@ -2215,14 +2217,14 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, static_cast(values_to_read), &builder_); /// Flush values since they have been copied into the builder - ResetValues(); + TypedRecordReader::ResetValues(); } CheckNumberDecoded(num_decoded, values_to_read); } void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { int64_t num_decoded = 0; - if (current_encoding_ == Encoding::RLE_DICTIONARY) { + if (TypedRecordReader::current_encoding_ == Encoding::RLE_DICTIONARY) { MaybeWriteNewDictionary(); auto decoder = dynamic_cast(this->current_decoder_); num_decoded = decoder->DecodeIndicesSpaced( @@ -2234,99 +2236,20 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, valid_bits_->mutable_data(), values_written_, &builder_); /// Flush values since they have been copied into the builder - ResetValues(); + TypedRecordReader::ResetValues(); } ARROW_DCHECK_EQ(num_decoded, values_to_read - null_count); } private: - using BinaryDictDecoder = DictDecoder; + using BinaryDictDecoder = DictDecoder; - ::arrow::BinaryDictionary32Builder builder_; + typename EncodingTraits::DictAccumulator builder_; std::vector> result_chunks_; }; -class LargeByteArrayDictionaryRecordReader : public TypedRecordReader, - virtual public DictionaryRecordReader { - public: - LargeByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool, bool read_dense_for_nullable) - : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), - builder_(pool) { - this->read_dictionary_ = true; - } - - std::shared_ptr<::arrow::ChunkedArray> GetResult() override { - FlushBuilder(); - std::vector> result; - std::swap(result, result_chunks_); - return std::make_shared<::arrow::ChunkedArray>(std::move(result), builder_.type()); - } - - void FlushBuilder() { - if (builder_.length() > 0) { - std::shared_ptr<::arrow::Array> chunk; - PARQUET_THROW_NOT_OK(builder_.Finish(&chunk)); - result_chunks_.emplace_back(std::move(chunk)); - - // Also clears the dictionary memo table - builder_.Reset(); - } - } - - void MaybeWriteNewDictionary() { - if (this->new_dictionary_) { - /// If there is a new dictionary, we may need to flush the builder, then - /// insert the new dictionary values - FlushBuilder(); - builder_.ResetFull(); - auto decoder = dynamic_cast(this->current_decoder_); - decoder->InsertDictionary(&builder_); - this->new_dictionary_ = false; - } - } - - void ReadValuesDense(int64_t values_to_read) override { - int64_t num_decoded = 0; - if (current_encoding_ == Encoding::RLE_DICTIONARY) { - MaybeWriteNewDictionary(); - auto decoder = dynamic_cast(this->current_decoder_); - num_decoded = decoder->DecodeIndices(static_cast(values_to_read), &builder_); - } else { - num_decoded = this->current_decoder_->DecodeArrowNonNull( - static_cast(values_to_read), &builder_); - - /// Flush values since they have been copied into the builder - ResetValues(); - } - CheckNumberDecoded(num_decoded, values_to_read); - } - - void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { - int64_t num_decoded = 0; - if (current_encoding_ == Encoding::RLE_DICTIONARY) { - MaybeWriteNewDictionary(); - auto decoder = dynamic_cast(this->current_decoder_); - num_decoded = decoder->DecodeIndicesSpaced( - static_cast(values_to_read), static_cast(null_count), - valid_bits_->mutable_data(), values_written_, &builder_); 
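The explicit TypedRecordReader<...>:: qualifications scattered through these methods are not style noise: once the base class depends on a template parameter, its members are no longer found by unqualified name lookup in the derived class (two-phase lookup). A minimal illustration of the rule:

template <typename T>
struct Base {
  int member_ = 0;
};

template <typename T>
struct Derived : Base<T> {
  int Get() {
    // return member_;      // error: 'member_' is in a dependent base,
    //                      // so unqualified lookup does not see it
    return this->member_;   // OK; Base<T>::member_ also works
  }
};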
- } else { - num_decoded = this->current_decoder_->DecodeArrow( - static_cast(values_to_read), static_cast(null_count), - valid_bits_->mutable_data(), values_written_, &builder_); - - /// Flush values since they have been copied into the builder - ResetValues(); - } - ARROW_DCHECK_EQ(num_decoded, values_to_read - null_count); - } - - private: - using LargeBinaryDictDecoder = DictDecoder; - - ::arrow::LargeBinaryDictionary32Builder builder_; - std::vector> result_chunks_; -}; +using ByteArrayDictionaryRecordReader = DictionaryRecordReaderImpl; +using LargeByteArrayDictionaryRecordReader = DictionaryRecordReaderImpl; // TODO(wesm): Implement these to some satisfaction template <> From fe8d67bf7b4e5e8dfd1f25cda36672d177b9d592 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 16:55:17 -0300 Subject: [PATCH 19/69] make arrowbinaryhelper generic --- cpp/src/parquet/encoding.cc | 49 ++++++------------------------------- cpp/src/parquet/encoding.h | 12 +++++++-- 2 files changed, 18 insertions(+), 43 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 634ce1496c75e..eee5a914bf7ff 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1271,12 +1271,13 @@ int PlainBooleanDecoder::Decode(bool* buffer, int max_values) { return max_values; } -struct ArrowBinaryHelper { - explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* out) { +template +struct ArrowBinaryHelperBase { + explicit ArrowBinaryHelperBase(typename EncodingTraits::Accumulator* out) { this->out = out; this->builder = out->builder.get(); this->chunk_space_remaining = - ::arrow::kBinaryMemoryLimit - this->builder->value_data_length(); + EncodingTraits::memory_limit - this->builder->value_data_length(); } Status PushChunk() { @@ -1303,47 +1304,13 @@ struct ArrowBinaryHelper { Status AppendNull() { return builder->AppendNull(); } - typename EncodingTraits::Accumulator* out; - ::arrow::BinaryBuilder* builder; + typename EncodingTraits::Accumulator* out; + typename EncodingTraits::BinaryBuilder* builder; int64_t chunk_space_remaining; }; -struct ArrowLargeBinaryHelper { - explicit ArrowLargeBinaryHelper(typename EncodingTraits::Accumulator* out) { - this->out = out; - this->builder = out->builder.get(); - this->chunk_space_remaining = - ::arrow::kLargeBinaryMemoryLimit - this->builder->value_data_length(); - } - - Status PushChunk() { - std::shared_ptr<::arrow::Array> result; - RETURN_NOT_OK(builder->Finish(&result)); - out->chunks.push_back(result); - chunk_space_remaining = ::arrow::kLargeBinaryMemoryLimit; - return Status::OK(); - } - - bool CanFit(int64_t length) const { return length <= chunk_space_remaining; } - - void UnsafeAppend(const uint8_t* data, int64_t length) { - chunk_space_remaining -= length; - builder->UnsafeAppend(data, length); - } - - void UnsafeAppendNull() { builder->UnsafeAppendNull(); } - - Status Append(const uint8_t* data, int64_t length) { - chunk_space_remaining -= length; - return builder->Append(data, length); - } - - Status AppendNull() { return builder->AppendNull(); } - - typename EncodingTraits::Accumulator* out; - ::arrow::LargeBinaryBuilder* builder; - int64_t chunk_space_remaining; -}; +using ArrowBinaryHelper = ArrowBinaryHelperBase; +using ArrowLargeBinaryHelper = ArrowBinaryHelperBase; template <> inline int PlainDecoder::DecodeArrow( diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 1218a650238d7..b138c45e40f48 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ 
-24,6 +24,7 @@ #include "arrow/util/spaced.h" +#include "arrow/type.h" #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/types.h" @@ -142,30 +143,37 @@ template <> struct EncodingTraits { using Encoder = ByteArrayEncoder; using Decoder = ByteArrayDecoder; + using BinaryBuilder = ::arrow::BinaryBuilder; /// \brief Internal helper class for decoding BYTE_ARRAY data where we can /// overflow the capacity of a single arrow::BinaryArray struct Accumulator { - std::unique_ptr<::arrow::BinaryBuilder> builder; + std::unique_ptr builder; std::vector> chunks; }; using ArrowType = ::arrow::BinaryType; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; + + + static constexpr auto memory_limit = ::arrow::kBinaryMemoryLimit; }; template <> struct EncodingTraits { using Encoder = LargeByteArrayEncoder; using Decoder = LargeByteArrayDecoder; + using BinaryBuilder = ::arrow::LargeBinaryBuilder; /// \brief Internal helper class for decoding BYTE_ARRAY data where we can /// overflow the capacity of a single arrow::BinaryArray struct Accumulator { - std::unique_ptr<::arrow::LargeBinaryBuilder> builder; + std::unique_ptr builder; std::vector> chunks; }; using ArrowType = ::arrow::LargeBinaryType; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::LargeBinaryType>; + + static constexpr auto memory_limit = ::arrow::kLargeBinaryMemoryLimit; }; template <> From 35e58356f7c9a5e7dcbc7ac01d83d5772f808467 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 17:08:39 -0300 Subject: [PATCH 20/69] Make PlainByteArrayDecoder generic --- cpp/src/parquet/encoding.cc | 166 ++++++------------------------------ 1 file changed, 26 insertions(+), 140 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index eee5a914bf7ff..6cd9a833bbff6 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1388,10 +1388,11 @@ inline int PlainDecoder::DecodeArrow( return values_decoded; } -class PlainByteArrayDecoder : public PlainDecoder, - virtual public ByteArrayDecoder { +template +class PlainByteArrayDecoderBase : public PlainDecoder, + virtual public TypedDecoder { public: - using Base = PlainDecoder; + using Base = PlainDecoder; using Base::DecodeSpaced; using Base::PlainDecoder; @@ -1400,7 +1401,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { + typename EncodingTraits::DictAccumulator* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -1412,7 +1413,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out) override { + typename EncodingTraits::Accumulator* out) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, valid_bits_offset, out, &result)); @@ -1422,28 +1423,28 @@ class PlainByteArrayDecoder : public PlainDecoder, private: Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out, + typename EncodingTraits::Accumulator* out, int* out_values_decoded) { - ArrowBinaryHelper helper(out); + ArrowBinaryHelperBase helper(out); int values_decoded = 0; 
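The chunk_space_remaining bookkeeping in the templated helper exists because each instantiation plugs a different cap in via EncodingTraits::memory_limit: binary offsets are signed 32-bit, so the plain variant must split chunks near 2 GiB, while the large variant effectively never splits. A sketch of the relationship, assuming the upstream value of kBinaryMemoryLimit (kLargeBinaryMemoryLimit itself is introduced by this series):

#include <cstdint>
#include <limits>
#include <type_traits>
#include "arrow/array/builder_binary.h"  // defines arrow::kBinaryMemoryLimit
#include "arrow/type.h"

// 32-bit offsets cap one BinaryArray's value data just under 2 GiB; the
// 64-bit-offset LargeBinaryArray lifts that cap, which is the whole point
// of the binary "large variants".
static_assert(std::is_same<::arrow::BinaryType::offset_type, int32_t>::value, "");
static_assert(std::is_same<::arrow::LargeBinaryType::offset_type, int64_t>::value, "");
static_assert(::arrow::kBinaryMemoryLimit ==
                  std::numeric_limits<int32_t>::max() - 1, "");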
RETURN_NOT_OK(helper.builder->Reserve(num_values)); RETURN_NOT_OK(helper.builder->ReserveData( - std::min(len_, helper.chunk_space_remaining))); + std::min(PlainDecoder::len_, helper.chunk_space_remaining))); int i = 0; RETURN_NOT_OK(VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { - if (ARROW_PREDICT_FALSE(len_ < 4)) { + if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < 4)) { ParquetException::EofException(); } - auto value_len = SafeLoadAs(data_); + auto value_len = SafeLoadAs(PlainDecoder::data_); if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); } auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(len_ < increment)) { + if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < increment)) { ParquetException::EofException(); } if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) { @@ -1451,11 +1452,11 @@ class PlainByteArrayDecoder : public PlainDecoder, RETURN_NOT_OK(helper.PushChunk()); RETURN_NOT_OK(helper.builder->Reserve(num_values - i)); RETURN_NOT_OK(helper.builder->ReserveData( - std::min(len_, helper.chunk_space_remaining))); + std::min(PlainDecoder::len_, helper.chunk_space_remaining))); } - helper.UnsafeAppend(data_ + 4, value_len); - data_ += increment; - len_ -= increment; + helper.UnsafeAppend(PlainDecoder::data_ + 4, value_len); + PlainDecoder::data_ += increment; + PlainDecoder::len_ -= increment; ++values_decoded; ++i; return Status::OK(); @@ -1466,7 +1467,7 @@ class PlainByteArrayDecoder : public PlainDecoder, return Status::OK(); })); - num_values_ -= values_decoded; + PlainDecoder::num_values_ -= values_decoded; *out_values_decoded = values_decoded; return Status::OK(); } @@ -1481,148 +1482,33 @@ class PlainByteArrayDecoder : public PlainDecoder, RETURN_NOT_OK(VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { - if (ARROW_PREDICT_FALSE(len_ < 4)) { + if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < 4)) { ParquetException::EofException(); } - auto value_len = SafeLoadAs(data_); + auto value_len = SafeLoadAs(PlainDecoder::data_); if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); } auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(len_ < increment)) { + if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < increment)) { ParquetException::EofException(); } - RETURN_NOT_OK(builder->Append(data_ + 4, value_len)); - data_ += increment; - len_ -= increment; + RETURN_NOT_OK(builder->Append(PlainDecoder::data_ + 4, value_len)); + PlainDecoder::data_ += increment; + PlainDecoder::len_ -= increment; ++values_decoded; return Status::OK(); }, [&]() { return builder->AppendNull(); })); - num_values_ -= values_decoded; + PlainDecoder::num_values_ -= values_decoded; *out_values_decoded = values_decoded; return Status::OK(); } }; -class PlainLargeByteArrayDecoder : public PlainDecoder, - virtual public LargeByteArrayDecoder { - public: - using Base = PlainDecoder; - using Base::DecodeSpaced; - using Base::PlainDecoder; - - // ---------------------------------------------------------------------- - // Dictionary read paths - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::LargeBinaryDictionary32Builder* builder) override { - int result = 0; - PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, - valid_bits_offset, builder, &result)); - return result; - } - 
- // ---------------------------------------------------------------------- - // Optimized dense binary read paths - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out) override { - int result = 0; - PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, - valid_bits_offset, out, &result)); - return result; - } - - private: - Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out, - int* out_values_decoded) { - ArrowLargeBinaryHelper helper(out); - int values_decoded = 0; - - RETURN_NOT_OK(helper.builder->Reserve(num_values)); - RETURN_NOT_OK(helper.builder->ReserveData( - std::min(len_, helper.chunk_space_remaining))); - - int i = 0; - RETURN_NOT_OK(VisitNullBitmapInline( - valid_bits, valid_bits_offset, num_values, null_count, - [&]() { - if (ARROW_PREDICT_FALSE(len_ < 4)) { - ParquetException::EofException(); - } - auto value_len = SafeLoadAs(data_); - if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { - return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); - } - auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(len_ < increment)) { - ParquetException::EofException(); - } - if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) { - // This element would exceed the capacity of a chunk - RETURN_NOT_OK(helper.PushChunk()); - RETURN_NOT_OK(helper.builder->Reserve(num_values - i)); - RETURN_NOT_OK(helper.builder->ReserveData( - std::min(len_, helper.chunk_space_remaining))); - } - helper.UnsafeAppend(data_ + 4, value_len); - data_ += increment; - len_ -= increment; - ++values_decoded; - ++i; - return Status::OK(); - }, - [&]() { - helper.UnsafeAppendNull(); - ++i; - return Status::OK(); - })); - - num_values_ -= values_decoded; - *out_values_decoded = values_decoded; - return Status::OK(); - } - - template - Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* out_values_decoded) { - RETURN_NOT_OK(builder->Reserve(num_values)); - int values_decoded = 0; - - RETURN_NOT_OK(VisitNullBitmapInline( - valid_bits, valid_bits_offset, num_values, null_count, - [&]() { - if (ARROW_PREDICT_FALSE(len_ < 4)) { - ParquetException::EofException(); - } - auto value_len = SafeLoadAs(data_); - if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { - return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); - } - auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(len_ < increment)) { - ParquetException::EofException(); - } - RETURN_NOT_OK(builder->Append(data_ + 4, value_len)); - data_ += increment; - len_ -= increment; - ++values_decoded; - return Status::OK(); - }, - [&]() { return builder->AppendNull(); })); - - num_values_ -= values_decoded; - *out_values_decoded = values_decoded; - return Status::OK(); - } -}; +using PlainByteArrayDecoder = PlainByteArrayDecoderBase; +using PlainLargeByteArrayDecoder = PlainByteArrayDecoderBase; class PlainFLBADecoder : public PlainDecoder, virtual public FLBADecoder { public: From 9aff2f35cae8bd29721462d231f110b3b7f1edbd Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 1 Jun 2023 09:47:21 -0300 Subject: [PATCH 21/69] remove use_binary_large_variant from parquet reader properties --- cpp/src/parquet/properties.h | 6 ------ 1 file changed, 6 deletions(-) diff --git 
a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 26dfcaeb320ad..1a56064a0864e 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -116,12 +116,6 @@ class PARQUET_EXPORT ReaderProperties { page_checksum_verification_ = check_crc; } - bool use_binary_large_variants() const { return use_binary_large_variants_; } - - void set_use_binary_large_variants(bool use_binary_large_variants) { - use_binary_large_variants_ = use_binary_large_variants; - } - private: MemoryPool* pool_; int64_t buffer_size_ = kDefaultBufferSize; From eb850c4b44f7138c2f495cedabec332fdf9db2d3 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 5 Jun 2023 17:27:17 -0300 Subject: [PATCH 22/69] removed parquet::type::large_Byte_array --- cpp/src/parquet/encoding.cc | 21 ++++++++++++-------- cpp/src/parquet/encoding.h | 10 ++++++---- cpp/src/parquet/metadata.cc | 2 -- cpp/src/parquet/page_index.cc | 6 ++---- cpp/src/parquet/stream_reader.cc | 4 +--- cpp/src/parquet/stream_writer.cc | 5 +---- cpp/src/parquet/types.cc | 1 - cpp/src/parquet/types.h | 34 ++++++++++++++++++++------------ 8 files changed, 44 insertions(+), 39 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 6cd9a833bbff6..c1d1cda8f99a1 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3744,7 +3744,7 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) { + ::arrow::MemoryPool* pool, bool use_binary_large_variant) { if (encoding == Encoding::PLAIN) { switch (type_num) { case Type::BOOLEAN: @@ -3760,9 +3760,11 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin case Type::DOUBLE: return std::make_unique>(descr); case Type::BYTE_ARRAY: - return std::make_unique(descr); - case Type::LARGE_BYTE_ARRAY: - return std::make_unique(descr); + if (use_binary_large_variant) { + return std::make_unique(descr); + } else { + return std::make_unique(descr); + } case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique(descr); default: @@ -3812,7 +3814,8 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin namespace detail { std::unique_ptr MakeDictDecoder(Type::type type_num, const ColumnDescriptor* descr, - MemoryPool* pool) { + MemoryPool* pool, + bool use_binary_large_variant) { switch (type_num) { case Type::BOOLEAN: ParquetException::NYI("Dictionary encoding not implemented for boolean type"); @@ -3827,9 +3830,11 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, case Type::DOUBLE: return std::make_unique>(descr, pool); case Type::BYTE_ARRAY: - return std::make_unique(descr, pool); - case Type::LARGE_BYTE_ARRAY: - return std::make_unique(descr, pool); + if (use_binary_large_variant) { + return std::make_unique(descr, pool); + } else { + return std::make_unique(descr, pool); + } case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, pool); default: diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index b138c45e40f48..6ebdd59c35214 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -462,14 +462,15 @@ std::unique_ptr::Encoder> MakeTypedEncoder( PARQUET_EXPORT std::unique_ptr MakeDecoder( Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool 
use_binary_large_variant = false); namespace detail { PARQUET_EXPORT std::unique_ptr MakeDictDecoder(Type::type type_num, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool); + ::arrow::MemoryPool* pool, + bool use_binary_large_variant); } // namespace detail @@ -478,7 +479,7 @@ std::unique_ptr> MakeDictDecoder( const ColumnDescriptor* descr = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { using OutType = DictDecoder; - auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool); + auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool, std::is_same_v); return std::unique_ptr(dynamic_cast(decoder.release())); } @@ -487,7 +488,8 @@ std::unique_ptr::Decoder> MakeTypedDecoder( Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { using OutType = typename EncodingTraits::Decoder; - std::unique_ptr base = MakeDecoder(DType::type_num, encoding, descr, pool); + + std::unique_ptr base = MakeDecoder(DType::type_num, encoding, descr, pool, std::is_same_v); return std::unique_ptr(dynamic_cast(base.release())); } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 055e679a9b685..0bbd96580774a 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -123,8 +123,6 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d return MakeTypedColumnStats(meta_data, descr); case Type::BYTE_ARRAY: return MakeTypedColumnStats(meta_data, descr); - case Type::LARGE_BYTE_ARRAY: - return MakeTypedColumnStats(meta_data, descr); case Type::FIXED_LEN_BYTE_ARRAY: return MakeTypedColumnStats(meta_data, descr); case Type::UNDEFINED: diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 969db469bbeb5..f3bca027dac5b 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -853,8 +853,7 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, return std::make_unique>(descr, column_index); case Type::BYTE_ARRAY: return std::make_unique>(descr, column_index); - case Type::LARGE_BYTE_ARRAY: - return std::make_unique>(descr, column_index); + // TODO AP FIX ARTHUR PASSOS case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, column_index); case Type::UNDEFINED: @@ -899,8 +898,7 @@ std::unique_ptr ColumnIndexBuilder::Make( return std::make_unique>(descr); case Type::BYTE_ARRAY: return std::make_unique>(descr); - case Type::LARGE_BYTE_ARRAY: - return std::make_unique>(descr); + // TODO AP FIX ARTHUR PASSOS case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr); case Type::UNDEFINED: diff --git a/cpp/src/parquet/stream_reader.cc b/cpp/src/parquet/stream_reader.cc index 66bcf5ca97560..fc22a76ab0ca9 100644 --- a/cpp/src/parquet/stream_reader.cc +++ b/cpp/src/parquet/stream_reader.cc @@ -488,9 +488,7 @@ void StreamReader::SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_sk case Type::BYTE_ARRAY: num_skipped = static_cast(reader)->Skip(num_rows_to_skip); break; - case Type::LARGE_BYTE_ARRAY: - num_skipped = static_cast(reader)->Skip(num_rows_to_skip); - break; + // TODO AP FIX ARTHUR PASSOS case Type::FIXED_LEN_BYTE_ARRAY: num_skipped = static_cast(reader)->Skip(num_rows_to_skip); break; diff --git a/cpp/src/parquet/stream_writer.cc b/cpp/src/parquet/stream_writer.cc index d93368740f9a8..e7cf995c4f486 100644 --- a/cpp/src/parquet/stream_writer.cc +++ b/cpp/src/parquet/stream_writer.cc @@ -251,10 +251,7 @@ void StreamWriter::WriteNullValue(ColumnWriter* writer) 
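MakeTypedDecoder and MakeDictDecoder now derive the runtime use_binary_large_variant flag from the template argument, so callers that instantiate with LargeByteArrayType get the large decoders without passing anything extra. A reduced model of that dispatch (kUseLargeVariant is an illustrative name, not part of the patch):

#include <type_traits>
#include "parquet/types.h"

template <typename DType>
constexpr bool kUseLargeVariant = std::is_same_v<DType, parquet::LargeByteArrayType>;

static_assert(!kUseLargeVariant<parquet::ByteArrayType>, "");
static_assert(kUseLargeVariant<parquet::LargeByteArrayType>, "");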
{ static_cast(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr); break; - case Type::LARGE_BYTE_ARRAY: - static_cast(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero, - &kRepLevelZero, nullptr); - break; + // TODO AP FIX ARTHUR PASSOS case Type::FIXED_LEN_BYTE_ARRAY: static_cast(writer)->WriteBatch( kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr); diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index d5d0442177934..28f472aaf9dd8 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -260,7 +260,6 @@ SortOrder::type DefaultSortOrder(Type::type primitive) { case Type::DOUBLE: return SortOrder::SIGNED; case Type::BYTE_ARRAY: - case Type::LARGE_BYTE_ARRAY: case Type::FIXED_LEN_BYTE_ARRAY: return SortOrder::UNSIGNED; case Type::INT96: diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index cca6247922ce2..972979fa29cb9 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -64,10 +64,6 @@ struct Type { DOUBLE = 5, BYTE_ARRAY = 6, FIXED_LEN_BYTE_ARRAY = 7, - - // This parquet type does not actually exist (AFAIK) and is used to - // create proper type traits - LARGE_BYTE_ARRAY = 8, // Should always be last element. UNDEFINED = 9 }; @@ -768,13 +764,13 @@ struct type_traits { static constexpr const char* printf_code = "s"; }; -template<> -struct type_traits { - using value_type = LargeByteArray; - - static constexpr int value_byte_size = sizeof(LargeByteArray); - static constexpr const char* printf_code = "ls"; -}; +//template<> +//struct type_traits { +// using value_type = LargeByteArray; +// +// static constexpr int value_byte_size = sizeof(LargeByteArray); +// static constexpr const char* printf_code = "ls"; +//}; template <> struct type_traits { @@ -796,8 +792,20 @@ using Int64Type = PhysicalType; using Int96Type = PhysicalType; using FloatType = PhysicalType; using DoubleType = PhysicalType; -using ByteArrayType = PhysicalType; -using LargeByteArrayType = PhysicalType; + +struct ByteArrayType +{ + using c_type = typename type_traits::value_type; + static constexpr Type::type type_num = Type::BYTE_ARRAY; +}; + + +struct LargeByteArrayType +{ + using c_type = typename type_traits::value_type; + static constexpr Type::type type_num = Type::BYTE_ARRAY; +}; + using FLBAType = PhysicalType; template From c2aab6304d8e43a9d92125cd9a05d6e88a37b5f5 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 5 Jun 2023 17:29:12 -0300 Subject: [PATCH 23/69] small adjustment --- cpp/src/parquet/types.h | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 972979fa29cb9..c8eb51ec90f53 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -764,14 +764,6 @@ struct type_traits { static constexpr const char* printf_code = "s"; }; -//template<> -//struct type_traits { -// using value_type = LargeByteArray; -// -// static constexpr int value_byte_size = sizeof(LargeByteArray); -// static constexpr const char* printf_code = "ls"; -//}; - template <> struct type_traits { using value_type = FixedLenByteArray; @@ -792,14 +784,11 @@ using Int64Type = PhysicalType; using Int96Type = PhysicalType; using FloatType = PhysicalType; using DoubleType = PhysicalType; +using ByteArrayType = PhysicalType; -struct ByteArrayType -{ - using c_type = typename type_traits::value_type; - static constexpr Type::type type_num = Type::BYTE_ARRAY; -}; - - +/* + * TODO AP add a comment explaining why the below is needed + * */ struct 
LargeByteArrayType { using c_type = typename type_traits::value_type; From 837ed6c88ccdbaf5849b5e798ca8d6bc2c6038ca Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 08:13:54 -0300 Subject: [PATCH 24/69] remove largebytearray class --- cpp/src/parquet/column_scanner.h | 8 -------- cpp/src/parquet/encoding.cc | 33 -------------------------------- cpp/src/parquet/types.h | 24 ----------------------- 3 files changed, 65 deletions(-) diff --git a/cpp/src/parquet/column_scanner.h b/cpp/src/parquet/column_scanner.h index 7bea4ca24d6db..d53435f03cd32 100644 --- a/cpp/src/parquet/column_scanner.h +++ b/cpp/src/parquet/column_scanner.h @@ -225,14 +225,6 @@ inline void TypedScanner::FormatValue(void* val, char* buffer, in snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); } -template <> -inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, - int width) { - std::string fmt = format_fwf(width); - std::string result = LargeByteArrayToString(*reinterpret_cast(val)); - snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); -} - template <> inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, int width) { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index c1d1cda8f99a1..e620686b30c60 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1126,39 +1126,6 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size, int nu return bytes_decoded; } -static inline int64_t ReadLargeByteArray(const uint8_t* data, int64_t data_size, - LargeByteArray* out) { - if (ARROW_PREDICT_FALSE(data_size < 4)) { - ParquetException::EofException(); - } - const int32_t len = SafeLoadAs(data); - if (len < 0) { - throw ParquetException("Invalid BYTE_ARRAY value"); - } - const int64_t consumed_length = static_cast(len) + 4; - if (ARROW_PREDICT_FALSE(data_size < consumed_length)) { - ParquetException::EofException(); - } - *out = LargeByteArray{static_cast(len), data + 4}; - return consumed_length; -} - -template <> -inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values, - int type_length, LargeByteArray* out) { - int bytes_decoded = 0; - for (int i = 0; i < num_values; ++i) { - const auto increment = ReadLargeByteArray(data, data_size, out + i); - if (ARROW_PREDICT_FALSE(increment > INT_MAX - bytes_decoded)) { - throw ParquetException("BYTE_ARRAY chunk too large"); - } - data += increment; - data_size -= increment; - bytes_decoded += static_cast(increment); - } - return bytes_decoded; -} - // Template specialization for FIXED_LEN_BYTE_ARRAY. The written values do not // own their own data. 
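After these removals, LargeByteArrayType is a pure compile-time tag: it maps to the same physical Parquet type and the same C++ value struct as ByteArrayType, and only template dispatch tells the two apart. What that buys, written as static_asserts against this series' parquet/types.h:

#include <type_traits>
#include "parquet/types.h"

static_assert(parquet::LargeByteArrayType::type_num == parquet::Type::BYTE_ARRAY, "");
static_assert(std::is_same<parquet::LargeByteArrayType::c_type,
                           parquet::ByteArray>::value, "");
static_assert(!std::is_same<parquet::LargeByteArrayType,
                            parquet::ByteArrayType>::value, "");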
template <> diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index c8eb51ec90f53..bb897b5073f5a 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -588,26 +588,6 @@ inline bool operator!=(const ByteArray& left, const ByteArray& right) { return !(left == right); } -struct LargeByteArray { - LargeByteArray() : len(0), ptr(NULLPTR) {} - LargeByteArray(uint64_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} - - LargeByteArray(::std::string_view view) // NOLINT implicit conversion - : LargeByteArray(view.size(), - reinterpret_cast(view.data())) {} - uint64_t len; - const uint8_t* ptr; -}; - -inline bool operator==(const LargeByteArray& left, const LargeByteArray& right) { - return left.len == right.len && - (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0); -} - -inline bool operator!=(const LargeByteArray& left, const LargeByteArray& right) { - return !(left == right); -} - struct FixedLenByteArray { FixedLenByteArray() : ptr(NULLPTR) {} explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {} @@ -642,10 +622,6 @@ static inline std::string ByteArrayToString(const ByteArray& a) { return std::string(reinterpret_cast(a.ptr), a.len); } -static inline std::string LargeByteArrayToString(const LargeByteArray& a) { - return std::string(reinterpret_cast(a.ptr), a.len); -} - static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) { std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); } From 35cdb993f4c753c9028699ea925a1fc7ae0c44a5 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 08:18:53 -0300 Subject: [PATCH 25/69] simplify largebytearraytype a bit --- cpp/src/parquet/types.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index bb897b5073f5a..6dcc5e082468b 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -765,11 +765,8 @@ using ByteArrayType = PhysicalType; /* * TODO AP add a comment explaining why the below is needed * */ -struct LargeByteArrayType -{ - using c_type = typename type_traits::value_type; - static constexpr Type::type type_num = Type::BYTE_ARRAY; -}; +struct LargeByteArrayType : public ByteArrayType +{}; using FLBAType = PhysicalType; From a5000e17e8509b3269b417005f99aae557687f74 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 10:01:00 -0300 Subject: [PATCH 26/69] simplify dictbytearraydecoderimpl a bit --- cpp/src/parquet/encoding.cc | 576 ++++++++++++------------------------ 1 file changed, 185 insertions(+), 391 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index e620686b30c60..1067f5af0c4bc 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -26,6 +26,7 @@ #include #include #include +#include #include "arrow/array.h" #include "arrow/array/builder_dict.h" @@ -1929,434 +1930,227 @@ void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); } -class DictByteArrayDecoderImpl : public DictDecoderImpl, - virtual public ByteArrayDecoder { - public: - using BASE = DictDecoderImpl; - using BASE::DictDecoderImpl; +template +class DictByteArrayDecoderImpl : public DictDecoderImpl, + virtual public TypedDecoder { - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { - int result = 0; - if (null_count == 0) { - 
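Note the using BASE::dictionary_ / BASE::idx_decoder_ / BASE::IndexInBounds declarations at the top of the merged class: they are the other standard fix for dependent-base lookup, pulling each name into scope once instead of writing this-> at every use site. A minimal illustration:

template <typename T>
struct Base {
  int member_ = 0;
};

template <typename T>
struct Derived : Base<T> {
  using BASE = Base<T>;
  using BASE::member_;           // bring the dependent-base name into scope
  int Get() { return member_; }  // unqualified lookup now succeeds
};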
PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
- } else {
- PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
- valid_bits_offset, builder, &result));
- }
- return result;
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits::Accumulator* out) override {
- int result = 0;
- if (null_count == 0) {
- PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result));
- } else {
- PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values,
null_count, valid_bits, + valid_bits_offset, out, &result)); + } + return result; + } + + private: + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + constexpr int32_t kBufferSize = 1024; + int32_t indices[kBufferSize]; + + ArrowBinaryHelperBase helper(out); + + auto dict_values = reinterpret_cast(dictionary_->data()); + int values_decoded = 0; + int num_indices = 0; + int pos_indices = 0; + + auto visit_valid = [&](int64_t position) -> Status { + if (num_indices == pos_indices) { + // Refill indices buffer + const auto batch_size = + std::min(kBufferSize, num_values - null_count - values_decoded); + num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (ARROW_PREDICT_FALSE(num_indices < 1)) { + return Status::Invalid("Invalid number of indices: ", num_indices); } + pos_indices = 0; } - } - } - - *out_num_values = values_decoded; - return Status::OK(); - } - - Status DecodeArrowDenseNonNull(int num_values, - typename EncodingTraits::Accumulator* out, - int* out_num_values) { - constexpr int32_t kBufferSize = 2048; - int32_t indices[kBufferSize]; - int values_decoded = 0; - - ArrowBinaryHelper helper(out); - auto dict_values = reinterpret_cast(dictionary_->data()); - - while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (num_indices == 0) ParquetException::EofException(); - for (int i = 0; i < num_indices; ++i) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; + const auto index = indices[pos_indices++]; + RETURN_NOT_OK(IndexInBounds(index)); + const auto& val = dict_values[index]; if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { RETURN_NOT_OK(helper.PushChunk()); } RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); - } - values_decoded += num_indices; - } - *out_num_values = values_decoded; - return Status::OK(); - } - - template - Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* out_num_values) { - constexpr int32_t kBufferSize = 1024; - int32_t indices[kBufferSize]; - - RETURN_NOT_OK(builder->Reserve(num_values)); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); - - auto dict_values = reinterpret_cast(dictionary_->data()); - - int values_decoded = 0; - int num_appended = 0; - while (num_appended < num_values) { - bool is_valid = bit_reader.IsSet(); - bit_reader.Next(); - - if (is_valid) { - int32_t batch_size = - std::min(kBufferSize, num_values - num_appended - null_count); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - - int i = 0; - while (true) { - // Consume all indices - if (is_valid) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - RETURN_NOT_OK(builder->Append(val.ptr, val.len)); - ++i; - ++values_decoded; - } else { - RETURN_NOT_OK(builder->AppendNull()); - --null_count; + ++values_decoded; + return Status::OK(); + }; + + auto visit_null = [&]() -> Status { + RETURN_NOT_OK(helper.AppendNull()); + return Status::OK(); + }; + + ::arrow::internal::BitBlockCounter bit_blocks(valid_bits, valid_bits_offset, + num_values); + int64_t position = 0; + while (position < num_values) { + const auto block = bit_blocks.NextWord(); + if (block.AllSet()) { + for 
(int64_t i = 0; i < block.length; ++i, ++position) { + ARROW_RETURN_NOT_OK(visit_valid(position)); } - ++num_appended; - if (i == num_indices) { - // Do not advance the bit_reader if we have fulfilled the decode - // request - break; + } else if (block.NoneSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + ARROW_RETURN_NOT_OK(visit_null()); + } + } else { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (bit_util::GetBit(valid_bits, valid_bits_offset + position)) { + ARROW_RETURN_NOT_OK(visit_valid(position)); + } else { + ARROW_RETURN_NOT_OK(visit_null()); + } } - is_valid = bit_reader.IsSet(); - bit_reader.Next(); - } - } else { - RETURN_NOT_OK(builder->AppendNull()); - --null_count; - ++num_appended; - } - } - *out_num_values = values_decoded; - return Status::OK(); - } - - template - Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) { - constexpr int32_t kBufferSize = 2048; - int32_t indices[kBufferSize]; - - RETURN_NOT_OK(builder->Reserve(num_values)); - - auto dict_values = reinterpret_cast(dictionary_->data()); - - int values_decoded = 0; - while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (num_indices == 0) ParquetException::EofException(); - for (int i = 0; i < num_indices; ++i) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - RETURN_NOT_OK(builder->Append(val.ptr, val.len)); - } - values_decoded += num_indices; - } - *out_num_values = values_decoded; - return Status::OK(); - } -}; - -class DictLargeByteArrayDecoderImpl : public DictDecoderImpl, - virtual public LargeByteArrayDecoder { - public: - using BASE = DictDecoderImpl; - using BASE::DictDecoderImpl; - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::LargeBinaryDictionary32Builder* builder) override { - int result = 0; - if (null_count == 0) { - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); - } else { - PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, - valid_bits_offset, builder, &result)); - } - return result; - } - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out) override { - int result = 0; - if (null_count == 0) { - PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result)); - } else { - PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, - valid_bits_offset, out, &result)); - } - return result; - } - - private: - Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out, - int* out_num_values) { - constexpr int32_t kBufferSize = 1024; - int32_t indices[kBufferSize]; - - ArrowLargeBinaryHelper helper(out); - - auto dict_values = reinterpret_cast(dictionary_->data()); - int values_decoded = 0; - int num_indices = 0; - int pos_indices = 0; - - auto visit_valid = [&](int64_t position) -> Status { - if (num_indices == pos_indices) { - // Refill indices buffer - const auto batch_size = - std::min(kBufferSize, num_values - null_count - values_decoded); - num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (ARROW_PREDICT_FALSE(num_indices < 1)) { - return Status::Invalid("Invalid number of indices: ", num_indices); } - 
pos_indices = 0; - } - const auto index = indices[pos_indices++]; - RETURN_NOT_OK(IndexInBounds(index)); - const auto& val = dict_values[index]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); } - RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); - ++values_decoded; - return Status::OK(); - }; - auto visit_null = [&]() -> Status { - RETURN_NOT_OK(helper.AppendNull()); + *out_num_values = values_decoded; return Status::OK(); - }; - - ::arrow::internal::BitBlockCounter bit_blocks(valid_bits, valid_bits_offset, - num_values); - int64_t position = 0; - while (position < num_values) { - const auto block = bit_blocks.NextWord(); - if (block.AllSet()) { - for (int64_t i = 0; i < block.length; ++i, ++position) { - ARROW_RETURN_NOT_OK(visit_valid(position)); - } - } else if (block.NoneSet()) { - for (int64_t i = 0; i < block.length; ++i, ++position) { - ARROW_RETURN_NOT_OK(visit_null()); - } - } else { - for (int64_t i = 0; i < block.length; ++i, ++position) { - if (bit_util::GetBit(valid_bits, valid_bits_offset + position)) { - ARROW_RETURN_NOT_OK(visit_valid(position)); - } else { - ARROW_RETURN_NOT_OK(visit_null()); - } - } - } } - *out_num_values = values_decoded; - return Status::OK(); - } + Status DecodeArrowDenseNonNull(int num_values, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + constexpr int32_t kBufferSize = 2048; + int32_t indices[kBufferSize]; + int values_decoded = 0; - Status DecodeArrowDenseNonNull(int num_values, - typename EncodingTraits::Accumulator* out, - int* out_num_values) { - constexpr int32_t kBufferSize = 2048; - int32_t indices[kBufferSize]; - int values_decoded = 0; + ArrowBinaryHelperBase helper(out); + auto dict_values = reinterpret_cast(dictionary_->data()); - ArrowLargeBinaryHelper helper(out); - auto dict_values = reinterpret_cast(dictionary_->data()); - - while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (num_indices == 0) ParquetException::EofException(); - for (int i = 0; i < num_indices; ++i) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); + while (values_decoded < num_values) { + int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (num_indices == 0) ParquetException::EofException(); + for (int i = 0; i < num_indices; ++i) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { + RETURN_NOT_OK(helper.PushChunk()); + } + RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); } - RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); + values_decoded += num_indices; } - values_decoded += num_indices; + *out_num_values = values_decoded; + return Status::OK(); } - *out_num_values = values_decoded; - return Status::OK(); - } - - template - Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* out_num_values) { - constexpr int32_t kBufferSize = 1024; - int32_t indices[kBufferSize]; - - RETURN_NOT_OK(builder->Reserve(num_values)); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); - - auto dict_values 
= reinterpret_cast(dictionary_->data()); - - int values_decoded = 0; - int num_appended = 0; - while (num_appended < num_values) { - bool is_valid = bit_reader.IsSet(); - bit_reader.Next(); - - if (is_valid) { - int32_t batch_size = - std::min(kBufferSize, num_values - num_appended - null_count); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - int i = 0; - while (true) { - // Consume all indices - if (is_valid) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - RETURN_NOT_OK(builder->Append(val.ptr, val.len)); - ++i; - ++values_decoded; - } else { - RETURN_NOT_OK(builder->AppendNull()); - --null_count; + template + Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_num_values) { + constexpr int32_t kBufferSize = 1024; + int32_t indices[kBufferSize]; + + RETURN_NOT_OK(builder->Reserve(num_values)); + ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + + auto dict_values = reinterpret_cast(dictionary_->data()); + + int values_decoded = 0; + int num_appended = 0; + while (num_appended < num_values) { + bool is_valid = bit_reader.IsSet(); + bit_reader.Next(); + + if (is_valid) { + int32_t batch_size = + std::min(kBufferSize, num_values - num_appended - null_count); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + + int i = 0; + while (true) { + // Consume all indices + if (is_valid) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + RETURN_NOT_OK(builder->Append(val.ptr, val.len)); + ++i; + ++values_decoded; + } else { + RETURN_NOT_OK(builder->AppendNull()); + --null_count; + } + ++num_appended; + if (i == num_indices) { + // Do not advance the bit_reader if we have fulfilled the decode + // request + break; + } + is_valid = bit_reader.IsSet(); + bit_reader.Next(); } + } else { + RETURN_NOT_OK(builder->AppendNull()); + --null_count; ++num_appended; - if (i == num_indices) { - // Do not advance the bit_reader if we have fulfilled the decode - // request - break; - } - is_valid = bit_reader.IsSet(); - bit_reader.Next(); } - } else { - RETURN_NOT_OK(builder->AppendNull()); - --null_count; - ++num_appended; } + *out_num_values = values_decoded; + return Status::OK(); } - *out_num_values = values_decoded; - return Status::OK(); - } - template - Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) { - constexpr int32_t kBufferSize = 2048; - int32_t indices[kBufferSize]; + template + Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) { + constexpr int32_t kBufferSize = 2048; + int32_t indices[kBufferSize]; - RETURN_NOT_OK(builder->Reserve(num_values)); + RETURN_NOT_OK(builder->Reserve(num_values)); - auto dict_values = reinterpret_cast(dictionary_->data()); + auto dict_values = reinterpret_cast(dictionary_->data()); - int values_decoded = 0; - while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (num_indices == 0) ParquetException::EofException(); - for (int i = 0; i < num_indices; ++i) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - RETURN_NOT_OK(builder->Append(val.ptr, val.len)); + int values_decoded = 0; + while (values_decoded < num_values) { + int32_t batch_size = std::min(kBufferSize, 
num_values - values_decoded);
+      int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+      if (num_indices == 0) ParquetException::EofException();
+      for (int i = 0; i < num_indices; ++i) {
+        auto idx = indices[i];
+        RETURN_NOT_OK(IndexInBounds(idx));
+        const auto& val = dict_values[idx];
+        RETURN_NOT_OK(builder->Append(val.ptr, val.len));
+      }
+      values_decoded += num_indices;
     }
-      values_decoded += num_indices;
+    *out_num_values = values_decoded;
+    return Status::OK();
   }
-    *out_num_values = values_decoded;
-    return Status::OK();
-  }
 };
 
+using DictLargeByteArrayDecoderImpl = DictByteArrayDecoderImpl<LargeByteArrayType>;
+
 // ----------------------------------------------------------------------
 // DeltaBitPackEncoder
 
@@ -3800,7 +3594,7 @@ std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
       if (use_binary_large_variant) {
         return std::make_unique<DictLargeByteArrayDecoderImpl>(descr, pool);
       } else {
-        return std::make_unique<DictByteArrayDecoderImpl>(descr, pool);
+        return std::make_unique<DictByteArrayDecoderImpl<ByteArrayType>>(descr, pool);
       }
     case Type::FIXED_LEN_BYTE_ARRAY:
       return std::make_unique<DictDecoderImpl<FLBAType>>(descr, pool);

From eb71c17fa906733256ad6eb2cf30bc8415d87b23 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 6 Jun 2023 10:11:04 -0300
Subject: [PATCH 27/69] remove one default argument

---
 cpp/src/parquet/arrow/schema_internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h
index d27440ea22301..2173e4ea18ad0 100644
--- a/cpp/src/parquet/arrow/schema_internal.h
+++ b/cpp/src/parquet/arrow/schema_internal.h
@@ -40,7 +40,7 @@ Result<std::shared_ptr<::arrow::DataType>> FromInt64(const LogicalType& logical_
 
 Result<std::shared_ptr<::arrow::DataType>> GetArrowType(Type::type physical_type,
                                                         const LogicalType& logical_type,
                                                         int type_length,
-                                                        bool use_binary_large_variant = false);
+                                                        bool use_binary_large_variant);
 
 Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
     Type::type physical_type, const LogicalType& logical_type, int type_length,

From 686a3f7d9994e3112955d53a119bd73f265b3372 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 6 Jun 2023 10:15:08 -0300
Subject: [PATCH 28/69] remove junk code

---
 cpp/src/parquet/arrow/reader_internal.cc | 1 -
 cpp/src/parquet/column_reader.h          | 6 ------
 cpp/src/parquet/column_writer.h          | 1 -
 cpp/src/parquet/encoding.cc              | 1 -
 4 files changed, 9 deletions(-)

diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc
index b9c913bc24291..a294b712a7ce3 100644
--- a/cpp/src/parquet/arrow/reader_internal.cc
+++ b/cpp/src/parquet/arrow/reader_internal.cc
@@ -85,7 +85,6 @@ using ::arrow::internal::SafeLeftShift;
 using ::arrow::util::SafeLoadAs;
 
 using parquet::internal::BinaryRecordReader;
-using parquet::internal::LargeBinaryRecordReader;
 using parquet::internal::DictionaryRecordReader;
 using parquet::internal::RecordReader;
 using parquet::schema::GroupNode;
diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h
index 2c6dfea9d39a1..471117c1f13e6 100644
--- a/cpp/src/parquet/column_reader.h
+++ b/cpp/src/parquet/column_reader.h
@@ -471,11 +471,6 @@ class BinaryRecordReader : virtual public RecordReader {
   virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
 };
 
-class LargeBinaryRecordReader : virtual public BinaryRecordReader {
- public:
-  virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
-};
-
 /// \brief Read records directly to dictionary-encoded Arrow form (int32
 /// indices). 
Only valid for BYTE_ARRAY columns class DictionaryRecordReader : virtual public RecordReader { @@ -492,7 +487,6 @@ using Int96Reader = TypedColumnReader; using FloatReader = TypedColumnReader; using DoubleReader = TypedColumnReader; using ByteArrayReader = TypedColumnReader; -using LargeByteArrayReader = TypedColumnReader; using FixedLenByteArrayReader = TypedColumnReader; } // namespace parquet diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 545ecbb6732f8..792b108ac8835 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -233,7 +233,6 @@ using Int96Writer = TypedColumnWriter; using FloatWriter = TypedColumnWriter; using DoubleWriter = TypedColumnWriter; using ByteArrayWriter = TypedColumnWriter; -using LargeByteArrayWriter = TypedColumnWriter; using FixedLenByteArrayWriter = TypedColumnWriter; namespace internal { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 1067f5af0c4bc..8ea91d4e33794 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -26,7 +26,6 @@ #include #include #include -#include #include "arrow/array.h" #include "arrow/array/builder_dict.h" From a61fc32cdfbb420f3506589f649b4922d067ff9d Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 10:20:46 -0300 Subject: [PATCH 29/69] move use_binary_large_variant check inside frombytearray --- cpp/src/parquet/arrow/schema_internal.cc | 26 +++++------------------- cpp/src/parquet/arrow/schema_internal.h | 4 ++-- cpp/src/parquet/types.h | 2 +- 3 files changed, 8 insertions(+), 24 deletions(-) diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index a256ec4a6d7f9..a971f334dccb2 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -110,34 +110,18 @@ Result> MakeArrowTimestamp(const LogicalType& logical } } -Result> FromByteArray(const LogicalType& logical_type) { +Result> FromByteArray(const LogicalType& logical_type, + bool use_binary_large_variant) { switch (logical_type.type()) { case LogicalType::Type::STRING: - return ::arrow::utf8(); + return use_binary_large_variant ? ::arrow::large_utf8() : ::arrow::utf8(); case LogicalType::Type::DECIMAL: return MakeArrowDecimal(logical_type); case LogicalType::Type::NONE: case LogicalType::Type::ENUM: case LogicalType::Type::JSON: case LogicalType::Type::BSON: - return ::arrow::binary(); - default: - return Status::NotImplemented("Unhandled logical logical_type ", - logical_type.ToString(), " for binary array"); - } -} - -Result> FromLargeByteArray(const LogicalType& logical_type) { - switch (logical_type.type()) { - case LogicalType::Type::STRING: - return ::arrow::large_utf8(); - case LogicalType::Type::DECIMAL: - return MakeArrowDecimal(logical_type); - case LogicalType::Type::NONE: - case LogicalType::Type::ENUM: - case LogicalType::Type::JSON: - case LogicalType::Type::BSON: - return ::arrow::large_binary(); + return use_binary_large_variant ? ::arrow::large_binary() : ::arrow::binary(); default: return Status::NotImplemented("Unhandled logical logical_type ", logical_type.ToString(), " for binary array"); @@ -217,7 +201,7 @@ Result> GetArrowType( case ParquetType::DOUBLE: return ::arrow::float64(); case ParquetType::BYTE_ARRAY: - return use_binary_large_variant ? 
FromLargeByteArray(logical_type) : FromByteArray(logical_type); + return FromByteArray(logical_type, use_binary_large_variant); case ParquetType::FIXED_LEN_BYTE_ARRAY: return FromFLBA(logical_type, type_length); default: { diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index 2173e4ea18ad0..9bcebc49d3b96 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -29,8 +29,8 @@ namespace arrow { using ::arrow::Result; -Result> FromByteArray(const LogicalType& logical_type); -Result> FromLargeByteArray(const LogicalType& logical_type); +Result> FromByteArray(const LogicalType& logical_type, + bool use_binary_large_variant); Result> FromFLBA(const LogicalType& logical_type, int32_t physical_length); diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 6dcc5e082468b..41bf6c903e3e4 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -65,7 +65,7 @@ struct Type { BYTE_ARRAY = 6, FIXED_LEN_BYTE_ARRAY = 7, // Should always be last element. - UNDEFINED = 9 + UNDEFINED }; }; From e2600d0620345afad064a8da17fce0015e35c022 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 10:36:25 -0300 Subject: [PATCH 30/69] simplify chunkedrecordreader a bit --- cpp/src/parquet/column_reader.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 10a63ede4fd59..e5910e2d79b14 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2119,6 +2119,9 @@ template class ChunkedRecordReader : public TypedRecordReader, virtual public BinaryRecordReader { public: + using BASE = TypedRecordReader; + using BASE::descr_; + using BASE::ResetValues; using BuilderType = typename ByteArrayBuilderTypeTrait::BuilderType; ChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, @@ -2126,7 +2129,7 @@ class ChunkedRecordReader : public TypedRecordReader, : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { static_assert(IsByteArrayType::value, "Invalid ByteArrayType"); - ARROW_DCHECK_EQ(TypedRecordReader::descr_->physical_type(), Type::BYTE_ARRAY); + ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); accumulator_.builder = std::make_unique(pool); } @@ -2145,7 +2148,7 @@ class ChunkedRecordReader : public TypedRecordReader, int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull( static_cast(values_to_read), &accumulator_); CheckNumberDecoded(num_decoded, values_to_read); - TypedRecordReader::ResetValues(); + ResetValues(); } void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { @@ -2153,7 +2156,7 @@ class ChunkedRecordReader : public TypedRecordReader, static_cast(values_to_read), static_cast(null_count), valid_bits_->mutable_data(), values_written_, &accumulator_); CheckNumberDecoded(num_decoded, values_to_read - null_count); - TypedRecordReader::ResetValues(); + ResetValues(); } private: From 3b86e23e4bbc1079b78f901b1415e3c7aeea432a Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 11:05:26 -0300 Subject: [PATCH 31/69] simplify DictionaryRecordReaderImpl and fix DebugPrintState --- cpp/src/parquet/column_reader.cc | 69 ++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index e5910e2d79b14..7dd31632ba14d 100644 --- a/cpp/src/parquet/column_reader.cc +++ 
b/cpp/src/parquet/column_reader.cc @@ -1988,33 +1988,33 @@ class TypedRecordReader : public TypedColumnReaderImpl, } void DebugPrintState() override { -// const int16_t* def_levels = this->def_levels(); -// const int16_t* rep_levels = this->rep_levels(); -// const int64_t total_levels_read = levels_position_; -// -// const T* vals = reinterpret_cast(this->values()); -// -// if (leaf_info_.def_level > 0) { -// std::cout << "def levels: "; -// for (int64_t i = 0; i < total_levels_read; ++i) { -// std::cout << def_levels[i] << " "; -// } -// std::cout << std::endl; -// } -// -// if (leaf_info_.rep_level > 0) { -// std::cout << "rep levels: "; -// for (int64_t i = 0; i < total_levels_read; ++i) { -// std::cout << rep_levels[i] << " "; -// } -// std::cout << std::endl; -// } -// -// std::cout << "values: "; -// for (int64_t i = 0; i < this->values_written(); ++i) { -//// std::cout << vals[i] << " "; -// } -// std::cout << std::endl; + const int16_t* def_levels = this->def_levels(); + const int16_t* rep_levels = this->rep_levels(); + const int64_t total_levels_read = levels_position_; + + const T* vals = reinterpret_cast(this->values()); + + if (leaf_info_.def_level > 0) { + std::cout << "def levels: "; + for (int64_t i = 0; i < total_levels_read; ++i) { + std::cout << def_levels[i] << " "; + } + std::cout << std::endl; + } + + if (leaf_info_.rep_level > 0) { + std::cout << "rep levels: "; + for (int64_t i = 0; i < total_levels_read; ++i) { + std::cout << rep_levels[i] << " "; + } + std::cout << std::endl; + } + + std::cout << "values: "; + for (int64_t i = 0; i < this->values_written(); ++i) { + std::cout << vals[i] << " "; + } + std::cout << std::endl; } void ResetValues() { @@ -2171,6 +2171,10 @@ using LargeByteArrayChunkedRecordReader = ChunkedRecordReader class DictionaryRecordReaderImpl : public TypedRecordReader, virtual public DictionaryRecordReader { + using BASE = TypedRecordReader; + using BASE::current_encoding_; + using BASE::ResetValues; + public: DictionaryRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) @@ -2211,7 +2215,7 @@ class DictionaryRecordReaderImpl : public TypedRecordReader, void ReadValuesDense(int64_t values_to_read) override { int64_t num_decoded = 0; - if (TypedRecordReader::current_encoding_ == Encoding::RLE_DICTIONARY) { + if (current_encoding_ == Encoding::RLE_DICTIONARY) { MaybeWriteNewDictionary(); auto decoder = dynamic_cast(this->current_decoder_); num_decoded = decoder->DecodeIndices(static_cast(values_to_read), &builder_); @@ -2220,14 +2224,14 @@ class DictionaryRecordReaderImpl : public TypedRecordReader, static_cast(values_to_read), &builder_); /// Flush values since they have been copied into the builder - TypedRecordReader::ResetValues(); + ResetValues(); } CheckNumberDecoded(num_decoded, values_to_read); } void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { int64_t num_decoded = 0; - if (TypedRecordReader::current_encoding_ == Encoding::RLE_DICTIONARY) { + if (current_encoding_ == Encoding::RLE_DICTIONARY) { MaybeWriteNewDictionary(); auto decoder = dynamic_cast(this->current_decoder_); num_decoded = decoder->DecodeIndicesSpaced( @@ -2239,7 +2243,7 @@ class DictionaryRecordReaderImpl : public TypedRecordReader, valid_bits_->mutable_data(), values_written_, &builder_); /// Flush values since they have been copied into the builder - TypedRecordReader::ResetValues(); + ResetValues(); } ARROW_DCHECK_EQ(num_decoded, values_to_read - null_count); } @@ 
-2261,6 +2265,9 @@ void TypedRecordReader::DebugPrintState() {} template <> void TypedRecordReader::DebugPrintState() {} +template <> +void TypedRecordReader::DebugPrintState() {} + template <> void TypedRecordReader::DebugPrintState() {} From cc027b7a152dfcbc5cac25647ba4ed398c502766 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 11:08:04 -0300 Subject: [PATCH 32/69] simplify PlainByteArrayDecoderBase --- cpp/src/parquet/encoding.cc | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 8ea91d4e33794..b49c351b7403a 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1360,6 +1360,9 @@ class PlainByteArrayDecoderBase : public PlainDecoder, virtual public TypedDecoder { public: using Base = PlainDecoder; + using Base::len_; + using Base::data_; + using Base::num_values_; using Base::DecodeSpaced; using Base::PlainDecoder; @@ -1397,21 +1400,21 @@ class PlainByteArrayDecoderBase : public PlainDecoder, RETURN_NOT_OK(helper.builder->Reserve(num_values)); RETURN_NOT_OK(helper.builder->ReserveData( - std::min(PlainDecoder::len_, helper.chunk_space_remaining))); + std::min(len_, helper.chunk_space_remaining))); int i = 0; RETURN_NOT_OK(VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { - if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < 4)) { + if (ARROW_PREDICT_FALSE(len_ < 4)) { ParquetException::EofException(); } - auto value_len = SafeLoadAs(PlainDecoder::data_); + auto value_len = SafeLoadAs(data_); if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); } auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < increment)) { + if (ARROW_PREDICT_FALSE(len_ < increment)) { ParquetException::EofException(); } if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) { @@ -1419,11 +1422,11 @@ class PlainByteArrayDecoderBase : public PlainDecoder, RETURN_NOT_OK(helper.PushChunk()); RETURN_NOT_OK(helper.builder->Reserve(num_values - i)); RETURN_NOT_OK(helper.builder->ReserveData( - std::min(PlainDecoder::len_, helper.chunk_space_remaining))); + std::min(len_, helper.chunk_space_remaining))); } - helper.UnsafeAppend(PlainDecoder::data_ + 4, value_len); - PlainDecoder::data_ += increment; - PlainDecoder::len_ -= increment; + helper.UnsafeAppend(data_ + 4, value_len); + data_ += increment; + len_ -= increment; ++values_decoded; ++i; return Status::OK(); @@ -1434,7 +1437,7 @@ class PlainByteArrayDecoderBase : public PlainDecoder, return Status::OK(); })); - PlainDecoder::num_values_ -= values_decoded; + num_values_ -= values_decoded; *out_values_decoded = values_decoded; return Status::OK(); } @@ -1449,26 +1452,26 @@ class PlainByteArrayDecoderBase : public PlainDecoder, RETURN_NOT_OK(VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { - if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < 4)) { + if (ARROW_PREDICT_FALSE(len_ < 4)) { ParquetException::EofException(); } - auto value_len = SafeLoadAs(PlainDecoder::data_); + auto value_len = SafeLoadAs(data_); if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); } auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < increment)) { + if (ARROW_PREDICT_FALSE(len_ < increment)) { ParquetException::EofException(); } - 
RETURN_NOT_OK(builder->Append(PlainDecoder::data_ + 4, value_len));
-          PlainDecoder::data_ += increment;
-          PlainDecoder::len_ -= increment;
+          RETURN_NOT_OK(builder->Append(data_ + 4, value_len));
+          data_ += increment;
+          len_ -= increment;
           ++values_decoded;
           return Status::OK();
         },
         [&]() { return builder->AppendNull(); }));
 
-    PlainDecoder::num_values_ -= values_decoded;
+    num_values_ -= values_decoded;
     *out_values_decoded = values_decoded;
     return Status::OK();
   }

From 177db7af52dc51563fb04d3669ed2a49ee0d67d8 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Wed, 7 Jun 2023 11:01:34 -0300
Subject: [PATCH 33/69] remove some todos

---
 cpp/src/parquet/stream_reader.cc | 1 -
 cpp/src/parquet/stream_writer.cc | 1 -
 2 files changed, 2 deletions(-)

diff --git a/cpp/src/parquet/stream_reader.cc b/cpp/src/parquet/stream_reader.cc
index fc22a76ab0ca9..0fecb1bf24615 100644
--- a/cpp/src/parquet/stream_reader.cc
+++ b/cpp/src/parquet/stream_reader.cc
@@ -488,7 +488,6 @@ void StreamReader::SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_sk
     case Type::BYTE_ARRAY:
       num_skipped = static_cast<ByteArrayReader*>(reader)->Skip(num_rows_to_skip);
       break;
-    // TODO AP FIX ARTHUR PASSOS
     case Type::FIXED_LEN_BYTE_ARRAY:
       num_skipped = static_cast<FixedLenByteArrayReader*>(reader)->Skip(num_rows_to_skip);
       break;
diff --git a/cpp/src/parquet/stream_writer.cc b/cpp/src/parquet/stream_writer.cc
index e7cf995c4f486..856436d701816 100644
--- a/cpp/src/parquet/stream_writer.cc
+++ b/cpp/src/parquet/stream_writer.cc
@@ -251,7 +251,6 @@ void StreamWriter::WriteNullValue(ColumnWriter* writer) {
       static_cast<ByteArrayWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
                                                         &kRepLevelZero, nullptr);
       break;
-    // TODO AP FIX ARTHUR PASSOS
     case Type::FIXED_LEN_BYTE_ARRAY:
       static_cast<FixedLenByteArrayWriter*>(writer)->WriteBatch(
           kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);

From 66223ee1008f5aaa78fd409fea68d96eb65890ed Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Wed, 7 Jun 2023 15:47:31 -0300
Subject: [PATCH 34/69] Add comment explaining why struct LargeByteArrayType instead of alias

---
 cpp/src/parquet/types.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h
index 41bf6c903e3e4..f3429746d3ca4 100644
--- a/cpp/src/parquet/types.h
+++ b/cpp/src/parquet/types.h
@@ -763,7 +763,11 @@ using DoubleType = PhysicalType<Type::DOUBLE>;
 using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
 
 /*
- * TODO AP add a comment explaining why the below is needed
+ * Parquet does not have a LARGE_BYTE_ARRAY_TYPE, but arrow does.
+ * It is used to store ByteArrays with length > 2^31 - 1.
+ * The below LargeByteArrayType is used by other classes to select the proper
+ * Readers/Writers/Builders/Encoders/Decoders by using the templated EncodingTraits. 
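The rationale in this comment can be made concrete with a small, self-contained
sketch of tag-type dispatch. The names below (the Tag structs, Traits,
MakeBuilderFor) are simplified stand-ins invented for illustration, not the
actual parquet declarations:

  // Simplified stand-in for EncodingTraits-style selection (illustrative only).
  #include <memory>
  #include "arrow/builder.h"

  struct ByteArrayTag {};
  // An empty derived struct is a *distinct type* to the template machinery,
  // unlike a type alias, so it can carry its own trait specialization while
  // behaving like the base type everywhere else.
  struct LargeByteArrayTag : ByteArrayTag {};

  template <typename Tag>
  struct Traits;  // primary template intentionally left undefined

  template <>
  struct Traits<ByteArrayTag> {
    using BuilderType = ::arrow::BinaryBuilder;  // 32-bit offsets
  };

  template <>
  struct Traits<LargeByteArrayTag> {
    using BuilderType = ::arrow::LargeBinaryBuilder;  // 64-bit offsets
  };

  // Code templated on the tag picks the matching builder automatically.
  template <typename Tag>
  std::unique_ptr<typename Traits<Tag>::BuilderType> MakeBuilderFor() {
    return std::make_unique<typename Traits<Tag>::BuilderType>();
  }

Had LargeByteArrayType been a plain alias of ByteArrayType, both trait
specializations would name the same type and fail to compile; deriving an
empty struct keeps the value layout while giving the compiler a distinct type
to dispatch on.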
+ * Since there is not a parquet equivalent, a struct has to be used as a workaround * */ struct LargeByteArrayType : public ByteArrayType {}; From 5cd39d8f629c6d3554e91d212d6a12ad319f0266 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 8 Jun 2023 10:01:37 -0300 Subject: [PATCH 35/69] address some pr comments --- .../parquet/parquet_arrow/reader_writer.cc | 2 +- .../parquet/arrow/arrow_reader_writer_test.cc | 6 +--- cpp/src/parquet/arrow/reader.cc | 5 ++- cpp/src/parquet/arrow/reader_internal.h | 2 +- cpp/src/parquet/arrow/schema.cc | 3 +- cpp/src/parquet/arrow/schema_internal.cc | 14 ++++---- cpp/src/parquet/arrow/schema_internal.h | 8 ++--- cpp/src/parquet/column_reader.cc | 33 +++++++++---------- cpp/src/parquet/encoding.cc | 10 +++--- cpp/src/parquet/encoding.h | 4 +-- cpp/src/parquet/properties.h | 13 ++++---- cpp/src/parquet/types.h | 2 +- 12 files changed, 48 insertions(+), 54 deletions(-) diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index debf62736bdd0..f5d96ec16ca64 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -137,4 +137,4 @@ int main(int argc, char** argv) { read_single_rowgroup(); read_single_column(); read_single_column_chunk(); -} \ No newline at end of file +} diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 0196f73e91a92..a98e0b321be0b 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3870,13 +3870,9 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) { TEST(TestArrowParquet, LargeByteArray) { auto path = test::get_data_file("chunked_string_map.parquet"); - TryReadDataFile(path, ::arrow::StatusCode::NotImplemented); - auto reader_properties = default_arrow_reader_properties(); - - reader_properties.set_use_binary_large_variants(true); - + reader_properties.set_use_large_binary_variants(true); TryReadDataFileWithProperties(path, reader_properties); } diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 2de3c9ab4b006..b163eaa4850c0 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -219,7 +219,7 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; - ctx->use_binary_large_variants = reader_properties_.use_binary_large_variants(); + ctx->use_large_binary_variants = reader_properties_.use_large_binary_variants(); return GetReader(manifest_.schema_fields[i], ctx, out); } @@ -467,8 +467,7 @@ class LeafReader : public ColumnReaderImpl { leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, /*read_dense_for_nullable*/ false, - ctx_->use_binary_large_variants - ); + ctx_->use_large_binary_variants); NextRowGroup(); } diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index c5ee54b7c03d4..6a904f3d45b6e 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -109,7 +109,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; - bool use_binary_large_variants = false; + bool use_large_binary_variants = false; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git a/cpp/src/parquet/arrow/schema.cc 
b/cpp/src/parquet/arrow/schema.cc index 4920bad21f0df..799c9a244ff43 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -473,7 +473,8 @@ ::arrow::Result> GetTypeForNode( SchemaTreeContext* ctx) { ASSIGN_OR_RAISE( std::shared_ptr storage_type, - GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit(), ctx->properties.use_binary_large_variants())); + GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit(), + ctx->properties.use_large_binary_variants())); if (ctx->properties.read_dictionary(column_index) && IsDictionaryReadSupported(*storage_type)) { return ::arrow::dictionary(::arrow::int32(), storage_type); diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index a971f334dccb2..1cf0ce34706ce 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -111,17 +111,17 @@ Result> MakeArrowTimestamp(const LogicalType& logical } Result> FromByteArray(const LogicalType& logical_type, - bool use_binary_large_variant) { + bool use_large_binary_variants) { switch (logical_type.type()) { case LogicalType::Type::STRING: - return use_binary_large_variant ? ::arrow::large_utf8() : ::arrow::utf8(); + return use_large_binary_variants ? ::arrow::large_utf8() : ::arrow::utf8(); case LogicalType::Type::DECIMAL: return MakeArrowDecimal(logical_type); case LogicalType::Type::NONE: case LogicalType::Type::ENUM: case LogicalType::Type::JSON: case LogicalType::Type::BSON: - return use_binary_large_variant ? ::arrow::large_binary() : ::arrow::binary(); + return use_large_binary_variants ? ::arrow::large_binary() : ::arrow::binary(); default: return Status::NotImplemented("Unhandled logical logical_type ", logical_type.ToString(), " for binary array"); @@ -182,7 +182,7 @@ Result> FromInt64(const LogicalType& logical_type) { Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, - const ::arrow::TimeUnit::type int96_arrow_time_unit, bool use_binary_large_variant) { + const ::arrow::TimeUnit::type int96_arrow_time_unit, bool use_large_binary_variants) { if (logical_type.is_invalid() || logical_type.is_null()) { return ::arrow::null(); } @@ -201,7 +201,7 @@ Result> GetArrowType( case ParquetType::DOUBLE: return ::arrow::float64(); case ParquetType::BYTE_ARRAY: - return FromByteArray(logical_type, use_binary_large_variant); + return FromByteArray(logical_type, use_large_binary_variants); case ParquetType::FIXED_LEN_BYTE_ARRAY: return FromFLBA(logical_type, type_length); default: { @@ -215,9 +215,9 @@ Result> GetArrowType( Result> GetArrowType( const schema::PrimitiveNode& primitive, const ::arrow::TimeUnit::type int96_arrow_time_unit, - bool use_binary_large_variant) { + bool use_large_binary_variants) { return GetArrowType(primitive.physical_type(), *primitive.logical_type(), - primitive.type_length(), int96_arrow_time_unit, use_binary_large_variant); + primitive.type_length(), int96_arrow_time_unit, use_large_binary_variants); } } // namespace arrow diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index 9bcebc49d3b96..67aecf6e73f1a 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -30,7 +30,7 @@ namespace arrow { using ::arrow::Result; Result> FromByteArray(const LogicalType& logical_type, - bool use_binary_large_variant); + bool use_large_binary_variants); Result> FromFLBA(const LogicalType& logical_type, int32_t 
physical_length); @@ -40,17 +40,17 @@ Result> FromInt64(const LogicalType& logical_ Result> GetArrowType(Type::type physical_type, const LogicalType& logical_type, int type_length, - bool use_binary_large_variant); + bool use_large_binary_variants); Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO, - bool use_binary_large_variant = false); + bool use_large_binary_variants = false); Result> GetArrowType( const schema::PrimitiveNode& primitive, ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO, - bool use_binary_large_variant = false); + bool use_large_binary_variants = false); } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 7dd31632ba14d..87d8e33d19df8 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2093,22 +2093,22 @@ class FLBARecordReader : public TypedRecordReader, std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_; }; -// Below concept could be used to simplify type assertion, but it seems like c++20 is not -// available +// TODO Below concept could be used to simplify type assertion, +// but it requires c++20 //template //concept ByteArrayTypeConcept = std::is_same::value || // std::is_same::value; -template +template struct IsByteArrayType : std::false_type {}; -template<> +template <> struct IsByteArrayType : std::true_type {}; -template<> +template <> struct IsByteArrayType : std::true_type {}; -template +template struct ByteArrayBuilderTypeTrait { using BuilderType = typename std::conditional::value, ::arrow::LargeBinaryBuilder, @@ -2116,15 +2116,15 @@ struct ByteArrayBuilderTypeTrait { }; template -class ChunkedRecordReader : public TypedRecordReader, - virtual public BinaryRecordReader { +class ByteArrayChunkedRecordReaderImpl : public TypedRecordReader, + virtual public BinaryRecordReader { public: using BASE = TypedRecordReader; using BASE::descr_; using BASE::ResetValues; using BuilderType = typename ByteArrayBuilderTypeTrait::BuilderType; - ChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, + ByteArrayChunkedRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { @@ -2164,19 +2164,18 @@ class ChunkedRecordReader : public TypedRecordReader, typename EncodingTraits::Accumulator accumulator_; }; -using ByteArrayChunkedRecordReader = ChunkedRecordReader; -using LargeByteArrayChunkedRecordReader = ChunkedRecordReader; - +using ByteArrayChunkedRecordReader = ByteArrayChunkedRecordReaderImpl; +using LargeByteArrayChunkedRecordReader = ByteArrayChunkedRecordReaderImpl; template -class DictionaryRecordReaderImpl : public TypedRecordReader, - virtual public DictionaryRecordReader { +class ByteArrayDictionaryRecordReaderImpl : public TypedRecordReader, + virtual public DictionaryRecordReader { using BASE = TypedRecordReader; using BASE::current_encoding_; using BASE::ResetValues; public: - DictionaryRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, + ByteArrayDictionaryRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), builder_(pool) { @@ -2255,8 +2254,8 @@ class DictionaryRecordReaderImpl : 
public TypedRecordReader, std::vector> result_chunks_; }; -using ByteArrayDictionaryRecordReader = DictionaryRecordReaderImpl; -using LargeByteArrayDictionaryRecordReader = DictionaryRecordReaderImpl; +using ByteArrayDictionaryRecordReader = ByteArrayDictionaryRecordReaderImpl; +using LargeByteArrayDictionaryRecordReader = ByteArrayDictionaryRecordReaderImpl; // TODO(wesm): Implement these to some satisfaction template <> diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b49c351b7403a..0274531b93f0d 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1926,7 +1926,7 @@ template <> void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { auto binary_builder = checked_cast<::arrow::LargeBinaryDictionary32Builder*>(builder); - // Make a BinaryArray referencing the internal dictionary data + // Make a LargeBinaryArray referencing the internal dictionary data auto arr = std::make_shared<::arrow::LargeBinaryArray>( dictionary_length_, byte_array_offsets_, byte_array_data_); PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); @@ -3507,7 +3507,7 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool, bool use_binary_large_variant) { + ::arrow::MemoryPool* pool, bool use_large_binary_variants) { if (encoding == Encoding::PLAIN) { switch (type_num) { case Type::BOOLEAN: @@ -3523,7 +3523,7 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin case Type::DOUBLE: return std::make_unique>(descr); case Type::BYTE_ARRAY: - if (use_binary_large_variant) { + if (use_large_binary_variants) { return std::make_unique(descr); } else { return std::make_unique(descr); @@ -3578,7 +3578,7 @@ namespace detail { std::unique_ptr MakeDictDecoder(Type::type type_num, const ColumnDescriptor* descr, MemoryPool* pool, - bool use_binary_large_variant) { + bool use_large_binary_variants) { switch (type_num) { case Type::BOOLEAN: ParquetException::NYI("Dictionary encoding not implemented for boolean type"); @@ -3593,7 +3593,7 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, case Type::DOUBLE: return std::make_unique>(descr, pool); case Type::BYTE_ARRAY: - if (use_binary_large_variant) { + if (use_large_binary_variants) { return std::make_unique(descr, pool); } else { return std::make_unique>(descr, pool); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 6ebdd59c35214..622f1d939e773 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -462,7 +462,7 @@ std::unique_ptr::Encoder> MakeTypedEncoder( PARQUET_EXPORT std::unique_ptr MakeDecoder( Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool use_binary_large_variant = false); + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool use_large_binary_variants = false); namespace detail { @@ -470,7 +470,7 @@ PARQUET_EXPORT std::unique_ptr MakeDictDecoder(Type::type type_num, const ColumnDescriptor* descr, ::arrow::MemoryPool* pool, - bool use_binary_large_variant); + bool use_large_binary_variants); } // namespace detail diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 1a56064a0864e..4e55b50375d7e 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -124,7 +124,6 @@ class PARQUET_EXPORT ReaderProperties { bool 
buffered_stream_enabled_ = false; bool page_checksum_verification_ = false; std::shared_ptr file_decryption_properties_; - bool use_binary_large_variants_ = false; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -785,7 +784,7 @@ class PARQUET_EXPORT ArrowReaderProperties { pre_buffer_(false), cache_options_(::arrow::io::CacheOptions::Defaults()), coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO), - use_binary_large_variants_(false) {} + use_large_binary_variants_(false) {} /// \brief Set whether to use the IO thread pool to parse columns in parallel. /// @@ -853,12 +852,12 @@ class PARQUET_EXPORT ArrowReaderProperties { return coerce_int96_timestamp_unit_; } - void set_use_binary_large_variants(bool use_binary_large_variants) { - use_binary_large_variants_ = use_binary_large_variants; + void set_use_large_binary_variants(bool use_large_binary_variants) { + use_large_binary_variants_ = use_large_binary_variants; } - bool use_binary_large_variants() const { - return use_binary_large_variants_; + bool use_large_binary_variants() const { + return use_large_binary_variants_; } private: @@ -869,7 +868,7 @@ class PARQUET_EXPORT ArrowReaderProperties { ::arrow::io::IOContext io_context_; ::arrow::io::CacheOptions cache_options_; ::arrow::TimeUnit::type coerce_int96_timestamp_unit_; - bool use_binary_large_variants_; + bool use_large_binary_variants_; }; /// EXPERIMENTAL: Constructs the default ArrowReaderProperties diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index f3429746d3ca4..1665e0c0222a3 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -65,7 +65,7 @@ struct Type { BYTE_ARRAY = 6, FIXED_LEN_BYTE_ARRAY = 7, // Should always be last element. - UNDEFINED + UNDEFINED = 8 }; }; From 10890104ca87bfe1cd92ac202f2435b590933cda Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 8 Jun 2023 14:29:53 -0300 Subject: [PATCH 36/69] address a few more comments --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 2 +- cpp/src/parquet/encoding.cc | 1 - cpp/src/parquet/types.h | 8 +++----- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index a98e0b321be0b..f92637ac4d406 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3871,7 +3871,7 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) { TEST(TestArrowParquet, LargeByteArray) { auto path = test::get_data_file("chunked_string_map.parquet"); TryReadDataFile(path, ::arrow::StatusCode::NotImplemented); - auto reader_properties = default_arrow_reader_properties(); + ArrowReaderProperties reader_properties; reader_properties.set_use_large_binary_variants(true); TryReadDataFileWithProperties(path, reader_properties); } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 0274531b93f0d..b49db31d4bff1 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1277,7 +1277,6 @@ struct ArrowBinaryHelperBase { }; using ArrowBinaryHelper = ArrowBinaryHelperBase; -using ArrowLargeBinaryHelper = ArrowBinaryHelperBase; template <> inline int PlainDecoder::DecodeArrow( diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 1665e0c0222a3..f24cad9e87bcc 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -763,11 +763,9 @@ using DoubleType = PhysicalType; using ByteArrayType = PhysicalType; /* - * Parquet does not have a LARGE_BYTE_ARRAY_TYPE, but arrow 
does. - * It is used to store ByteArrays with length > 2^31 - 1. - * The below LargeByteArrayType is used by other classes to select the proper - * Readers/Writers/Builders/Encoders/Decoders by using the templated EncodingTraits. - * Since there is not a parquet equivalent, a struct has to be used as a workaround + * Parquet uses ByteArrayType for variable length strings and binaries and their lengths + * will not exceed 2^31 - 1. However, arrow supports StringType/BinaryType and their + * large variants (i.e. LargeStringType and LargeBinaryType). * */ struct LargeByteArrayType : public ByteArrayType {}; From a6c42ee122e0582f5f3f6b6e6ccb07703a031107 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 8 Jun 2023 14:57:12 -0300 Subject: [PATCH 37/69] remove arrow-type include & move binarylimit trait --- cpp/src/parquet/encoding.cc | 20 ++++++++++++++++++-- cpp/src/parquet/encoding.h | 6 ------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b49db31d4bff1..3a1c4da937a24 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1238,20 +1238,36 @@ int PlainBooleanDecoder::Decode(bool* buffer, int max_values) { return max_values; } +template +struct ArrowBinaryHelperTraits; + +template <> +struct ArrowBinaryHelperTraits +{ + static constexpr auto memory_limit = ::arrow::kBinaryMemoryLimit; +}; + +template <> +struct ArrowBinaryHelperTraits +{ + static constexpr auto memory_limit = ::arrow::kLargeBinaryMemoryLimit; +}; + template struct ArrowBinaryHelperBase { + explicit ArrowBinaryHelperBase(typename EncodingTraits::Accumulator* out) { this->out = out; this->builder = out->builder.get(); this->chunk_space_remaining = - EncodingTraits::memory_limit - this->builder->value_data_length(); + ArrowBinaryHelperTraits::memory_limit - this->builder->value_data_length(); } Status PushChunk() { std::shared_ptr<::arrow::Array> result; RETURN_NOT_OK(builder->Finish(&result)); out->chunks.push_back(result); - chunk_space_remaining = ::arrow::kBinaryMemoryLimit; + chunk_space_remaining = ArrowBinaryHelperTraits::memory_limit; return Status::OK(); } diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 622f1d939e773..af5425fdc54b3 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -24,7 +24,6 @@ #include "arrow/util/spaced.h" -#include "arrow/type.h" #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/types.h" @@ -153,9 +152,6 @@ struct EncodingTraits { }; using ArrowType = ::arrow::BinaryType; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; - - - static constexpr auto memory_limit = ::arrow::kBinaryMemoryLimit; }; template <> @@ -172,8 +168,6 @@ struct EncodingTraits { }; using ArrowType = ::arrow::LargeBinaryType; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::LargeBinaryType>; - - static constexpr auto memory_limit = ::arrow::kLargeBinaryMemoryLimit; }; template <> From 15be2a2ce77e16d047c845b3ec3585bebe6850e0 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 8 Jun 2023 15:22:14 -0300 Subject: [PATCH 38/69] consolidate setdict --- cpp/src/parquet/encoding.cc | 84 +++++++++++++++---------------------- 1 file changed, 34 insertions(+), 50 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 3a1c4da937a24..f4c35f424c0a9 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1524,6 +1524,38 @@ class DictDecoderImpl : public 
DecoderImpl, virtual public DictDecoder { // Perform type-specific initiatialization void SetDict(TypedDecoder* dictionary) override; + template + || std::is_same_v>> + void SetByteArrayDict(TypedDecoder* dictionary) + { + DecodeDict(dictionary); + + auto dict_values = reinterpret_cast(dictionary_->mutable_data()); + + int total_size = 0; + for (int i = 0; i < dictionary_length_; ++i) { + total_size += dict_values[i].len; + } + PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, + /*shrink_to_fit=*/false)); + PARQUET_THROW_NOT_OK( + byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t), + /*shrink_to_fit=*/false)); + + int32_t offset = 0; + uint8_t* bytes_data = byte_array_data_->mutable_data(); + int32_t* bytes_offsets = + reinterpret_cast(byte_array_offsets_->mutable_data()); + for (int i = 0; i < dictionary_length_; ++i) { + memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); + bytes_offsets[i] = offset; + dict_values[i].ptr = bytes_data + offset; + offset += dict_values[i].len; + } + bytes_offsets[dictionary_length_] = offset; + } + void SetData(int num_values, const uint8_t* data, int len) override { num_values_ = num_values; if (len == 0) { @@ -1690,60 +1722,12 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictionary template <> void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { - DecodeDict(dictionary); - - auto dict_values = reinterpret_cast(dictionary_->mutable_data()); - - int total_size = 0; - for (int i = 0; i < dictionary_length_; ++i) { - total_size += dict_values[i].len; - } - PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, - /*shrink_to_fit=*/false)); - PARQUET_THROW_NOT_OK( - byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t), - /*shrink_to_fit=*/false)); - - int32_t offset = 0; - uint8_t* bytes_data = byte_array_data_->mutable_data(); - int32_t* bytes_offsets = - reinterpret_cast(byte_array_offsets_->mutable_data()); - for (int i = 0; i < dictionary_length_; ++i) { - memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); - bytes_offsets[i] = offset; - dict_values[i].ptr = bytes_data + offset; - offset += dict_values[i].len; - } - bytes_offsets[dictionary_length_] = offset; + SetByteArrayDict(dictionary); } template <> void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { - DecodeDict(dictionary); - - auto dict_values = reinterpret_cast(dictionary_->mutable_data()); - - int total_size = 0; - for (int i = 0; i < dictionary_length_; ++i) { - total_size += dict_values[i].len; - } - PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, - /*shrink_to_fit=*/false)); - PARQUET_THROW_NOT_OK( - byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t), - /*shrink_to_fit=*/false)); - - int32_t offset = 0; - uint8_t* bytes_data = byte_array_data_->mutable_data(); - int32_t* bytes_offsets = - reinterpret_cast(byte_array_offsets_->mutable_data()); - for (int i = 0; i < dictionary_length_; ++i) { - memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); - bytes_offsets[i] = offset; - dict_values[i].ptr = bytes_data + offset; - offset += dict_values[i].len; - } - bytes_offsets[dictionary_length_] = offset; + SetByteArrayDict(dictionary); } template <> From 8d5ba3df29f2f909e74d92c180246b36e0691d90 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 8 Jun 2023 15:32:30 -0300 Subject: [PATCH 39/69] apply clangformat --- .../parquet/arrow/arrow_reader_writer_test.cc | 10 +- cpp/src/parquet/arrow/reader.cc | 7 +- cpp/src/parquet/arrow/schema_internal.cc | 6 +- 
cpp/src/parquet/column_reader.cc | 56 +-- cpp/src/parquet/encoding.cc | 397 +++++++++--------- cpp/src/parquet/encoding.h | 9 +- cpp/src/parquet/properties.h | 4 +- cpp/src/parquet/types.h | 3 +- 8 files changed, 245 insertions(+), 247 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index f92637ac4d406..7fa6d23414756 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3834,14 +3834,14 @@ TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) { ASSERT_EQ(expected, calculated); } -void TryReadDataFileWithProperties(const std::string& path, - const ArrowReaderProperties& properties, - ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) { +void TryReadDataFileWithProperties( + const std::string& path, const ArrowReaderProperties& properties, + ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) { auto pool = ::arrow::default_memory_pool(); std::unique_ptr arrow_reader; - Status s = - FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), properties, &arrow_reader); + Status s = FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), properties, + &arrow_reader); if (s.ok()) { std::shared_ptr<::arrow::Table> table; s = arrow_reader->ReadTable(&table); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index b163eaa4850c0..e8a3f79aa0e6d 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -463,11 +463,8 @@ class LeafReader : public ColumnReaderImpl { input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make( - descr_, - leaf_info, - ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, - /*read_dense_for_nullable*/ false, - ctx_->use_large_binary_variants); + descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, + /*read_dense_for_nullable*/ false, ctx_->use_large_binary_variants); NextRowGroup(); } diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index 1cf0ce34706ce..b399b1f83dbdd 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -214,10 +214,10 @@ Result> GetArrowType( Result> GetArrowType( const schema::PrimitiveNode& primitive, - const ::arrow::TimeUnit::type int96_arrow_time_unit, - bool use_large_binary_variants) { + const ::arrow::TimeUnit::type int96_arrow_time_unit, bool use_large_binary_variants) { return GetArrowType(primitive.physical_type(), *primitive.logical_type(), - primitive.type_length(), int96_arrow_time_unit, use_large_binary_variants); + primitive.type_length(), int96_arrow_time_unit, + use_large_binary_variants); } } // namespace arrow diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 87d8e33d19df8..6da925bc71ba0 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2095,8 +2095,8 @@ class FLBARecordReader : public TypedRecordReader, // TODO Below concept could be used to simplify type assertion, // but it requires c++20 -//template -//concept ByteArrayTypeConcept = std::is_same::value || +// template +// concept ByteArrayTypeConcept = std::is_same::value || // std::is_same::value; template @@ -2110,12 +2110,13 @@ struct IsByteArrayType : std::true_type {}; template struct ByteArrayBuilderTypeTrait { - using BuilderType = typename std::conditional::value, - ::arrow::LargeBinaryBuilder, - 
::arrow::BinaryBuilder>::type; + using BuilderType = + typename std::conditional::value, + ::arrow::LargeBinaryBuilder, + ::arrow::BinaryBuilder>::type; }; -template +template class ByteArrayChunkedRecordReaderImpl : public TypedRecordReader, virtual public BinaryRecordReader { public: @@ -2125,9 +2126,9 @@ class ByteArrayChunkedRecordReaderImpl : public TypedRecordReader, using BuilderType = typename ByteArrayBuilderTypeTrait::BuilderType; ByteArrayChunkedRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool, bool read_dense_for_nullable) - : TypedRecordReader(descr, leaf_info, pool, - read_dense_for_nullable) { + ::arrow::MemoryPool* pool, + bool read_dense_for_nullable) + : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { static_assert(IsByteArrayType::value, "Invalid ByteArrayType"); ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); accumulator_.builder = std::make_unique(pool); @@ -2165,7 +2166,8 @@ class ByteArrayChunkedRecordReaderImpl : public TypedRecordReader, }; using ByteArrayChunkedRecordReader = ByteArrayChunkedRecordReaderImpl; -using LargeByteArrayChunkedRecordReader = ByteArrayChunkedRecordReaderImpl; +using LargeByteArrayChunkedRecordReader = + ByteArrayChunkedRecordReaderImpl; template class ByteArrayDictionaryRecordReaderImpl : public TypedRecordReader, @@ -2176,7 +2178,8 @@ class ByteArrayDictionaryRecordReaderImpl : public TypedRecordReader, public: ByteArrayDictionaryRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool, bool read_dense_for_nullable) + ::arrow::MemoryPool* pool, + bool read_dense_for_nullable) : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), builder_(pool) { this->read_dictionary_ = true; @@ -2254,8 +2257,10 @@ class ByteArrayDictionaryRecordReaderImpl : public TypedRecordReader, std::vector> result_chunks_; }; -using ByteArrayDictionaryRecordReader = ByteArrayDictionaryRecordReaderImpl; -using LargeByteArrayDictionaryRecordReader = ByteArrayDictionaryRecordReaderImpl; +using ByteArrayDictionaryRecordReader = + ByteArrayDictionaryRecordReaderImpl; +using LargeByteArrayDictionaryRecordReader = + ByteArrayDictionaryRecordReaderImpl; // TODO(wesm): Implement these to some satisfaction template <> @@ -2284,17 +2289,15 @@ std::shared_ptr MakeByteArrayRecordReader(const ColumnDescriptor* } } -std::shared_ptr MakeLargeByteArrayRecordReader(const ColumnDescriptor* descr, - LevelInfo leaf_info, - ::arrow::MemoryPool* pool, - bool read_dictionary, - bool read_dense_for_nullable) { +std::shared_ptr MakeLargeByteArrayRecordReader( + const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, + bool read_dictionary, bool read_dense_for_nullable) { if (read_dictionary) { - return std::make_shared(descr, leaf_info, pool, - read_dense_for_nullable); - } else { - return std::make_shared( + return std::make_shared( descr, leaf_info, pool, read_dense_for_nullable); + } else { + return std::make_shared(descr, leaf_info, pool, + read_dense_for_nullable); } } @@ -2325,10 +2328,11 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, return std::make_shared>(descr, leaf_info, pool, read_dense_for_nullable); case Type::BYTE_ARRAY: { - return use_binary_string_large_variants ? 
MakeLargeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, - read_dense_for_nullable) - : MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, - read_dense_for_nullable); + return use_binary_string_large_variants + ? MakeLargeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, + read_dense_for_nullable) + : MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, + read_dense_for_nullable); } case Type::FIXED_LEN_BYTE_ARRAY: return std::make_shared(descr, leaf_info, pool, diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index f4c35f424c0a9..b97950597e5f0 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1242,20 +1242,17 @@ template struct ArrowBinaryHelperTraits; template <> -struct ArrowBinaryHelperTraits -{ +struct ArrowBinaryHelperTraits { static constexpr auto memory_limit = ::arrow::kBinaryMemoryLimit; }; template <> -struct ArrowBinaryHelperTraits -{ +struct ArrowBinaryHelperTraits { static constexpr auto memory_limit = ::arrow::kLargeBinaryMemoryLimit; }; template struct ArrowBinaryHelperBase { - explicit ArrowBinaryHelperBase(typename EncodingTraits::Accumulator* out) { this->out = out; this->builder = out->builder.get(); @@ -1375,10 +1372,10 @@ class PlainByteArrayDecoderBase : public PlainDecoder, virtual public TypedDecoder { public: using Base = PlainDecoder; - using Base::len_; using Base::data_; - using Base::num_values_; using Base::DecodeSpaced; + using Base::len_; + using Base::num_values_; using Base::PlainDecoder; // ---------------------------------------------------------------------- @@ -1525,10 +1522,9 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { void SetDict(TypedDecoder* dictionary) override; template - || std::is_same_v>> - void SetByteArrayDict(TypedDecoder* dictionary) - { + typename = std::enable_if_t || + std::is_same_v>> + void SetByteArrayDict(TypedDecoder* dictionary) { DecodeDict(dictionary); auto dict_values = reinterpret_cast(dictionary_->mutable_data()); @@ -1726,7 +1722,8 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictio } template <> -void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { +void DictDecoderImpl::SetDict( + TypedDecoder* dictionary) { SetByteArrayDict(dictionary); } @@ -1922,7 +1919,8 @@ void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* bui } template <> -void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { +void DictDecoderImpl::InsertDictionary( + ::arrow::ArrayBuilder* builder) { auto binary_builder = checked_cast<::arrow::LargeBinaryDictionary32Builder*>(builder); // Make a LargeBinaryArray referencing the internal dictionary data @@ -1934,220 +1932,219 @@ void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder template class DictByteArrayDecoderImpl : public DictDecoderImpl, virtual public TypedDecoder { + public: + using BASE = DictDecoderImpl; + using BASE::DictDecoderImpl; + using BASE::dictionary_; + using BASE::idx_decoder_; + using BASE::IndexInBounds; - public: - using BASE = DictDecoderImpl; - using BASE::DictDecoderImpl; - using BASE::dictionary_; - using BASE::idx_decoder_; - using BASE::IndexInBounds; - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::DictAccumulator* builder) override { - int result = 0; - if (null_count == 0) { - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); - } else { - PARQUET_THROW_NOT_OK(DecodeArrow(num_values, 
null_count, valid_bits, - valid_bits_offset, builder, &result)); - } - return result; + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) override { + int result = 0; + if (null_count == 0) { + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); + } else { + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); } + return result; + } - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out) override { - int result = 0; - if (null_count == 0) { - PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result)); - } else { - PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, - valid_bits_offset, out, &result)); + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out) override { + int result = 0; + if (null_count == 0) { + PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result)); + } else { + PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, + valid_bits_offset, out, &result)); + } + return result; + } + + private: + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + constexpr int32_t kBufferSize = 1024; + int32_t indices[kBufferSize]; + + ArrowBinaryHelperBase helper(out); + + auto dict_values = reinterpret_cast(dictionary_->data()); + int values_decoded = 0; + int num_indices = 0; + int pos_indices = 0; + + auto visit_valid = [&](int64_t position) -> Status { + if (num_indices == pos_indices) { + // Refill indices buffer + const auto batch_size = + std::min(kBufferSize, num_values - null_count - values_decoded); + num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (ARROW_PREDICT_FALSE(num_indices < 1)) { + return Status::Invalid("Invalid number of indices: ", num_indices); + } + pos_indices = 0; } - return result; - } - - private: - Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out, - int* out_num_values) { - constexpr int32_t kBufferSize = 1024; - int32_t indices[kBufferSize]; - - ArrowBinaryHelperBase helper(out); - - auto dict_values = reinterpret_cast(dictionary_->data()); - int values_decoded = 0; - int num_indices = 0; - int pos_indices = 0; - - auto visit_valid = [&](int64_t position) -> Status { - if (num_indices == pos_indices) { - // Refill indices buffer - const auto batch_size = - std::min(kBufferSize, num_values - null_count - values_decoded); - num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (ARROW_PREDICT_FALSE(num_indices < 1)) { - return Status::Invalid("Invalid number of indices: ", num_indices); - } - pos_indices = 0; + const auto index = indices[pos_indices++]; + RETURN_NOT_OK(IndexInBounds(index)); + const auto& val = dict_values[index]; + if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { + RETURN_NOT_OK(helper.PushChunk()); + } + RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); + ++values_decoded; + return Status::OK(); + }; + + auto visit_null = [&]() -> Status { + RETURN_NOT_OK(helper.AppendNull()); + return Status::OK(); + }; + + 
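(The reindented block that follows scans the validity bitmap one 64-bit word at a time, so runs of all-valid or all-null values skip per-bit branching. In plain form, the BitBlockCounter idiom used below reads as this sketch, with visit_valid/visit_null standing in for the lambdas defined above:)

    ::arrow::internal::BitBlockCounter blocks(valid_bits, valid_bits_offset, num_values);
    int64_t position = 0;
    while (position < num_values) {
      const auto block = blocks.NextWord();  // classifies up to 64 validity bits at once
      if (block.AllSet()) {                  // fast path: no null checks needed
        for (int64_t i = 0; i < block.length; ++i, ++position) {
          ARROW_RETURN_NOT_OK(visit_valid(position));
        }
      } else if (block.NoneSet()) {          // fast path: every slot is null
        for (int64_t i = 0; i < block.length; ++i, ++position) {
          ARROW_RETURN_NOT_OK(visit_null());
        }
      } else {                               // mixed word: test each bit individually
        for (int64_t i = 0; i < block.length; ++i, ++position) {
          if (bit_util::GetBit(valid_bits, valid_bits_offset + position)) {
            ARROW_RETURN_NOT_OK(visit_valid(position));
          } else {
            ARROW_RETURN_NOT_OK(visit_null());
          }
        }
      }
    }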
::arrow::internal::BitBlockCounter bit_blocks(valid_bits, valid_bits_offset, + num_values); + int64_t position = 0; + while (position < num_values) { + const auto block = bit_blocks.NextWord(); + if (block.AllSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + ARROW_RETURN_NOT_OK(visit_valid(position)); } - const auto index = indices[pos_indices++]; - RETURN_NOT_OK(IndexInBounds(index)); - const auto& val = dict_values[index]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); + } else if (block.NoneSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + ARROW_RETURN_NOT_OK(visit_null()); } - RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); - ++values_decoded; - return Status::OK(); - }; - - auto visit_null = [&]() -> Status { - RETURN_NOT_OK(helper.AppendNull()); - return Status::OK(); - }; - - ::arrow::internal::BitBlockCounter bit_blocks(valid_bits, valid_bits_offset, - num_values); - int64_t position = 0; - while (position < num_values) { - const auto block = bit_blocks.NextWord(); - if (block.AllSet()) { - for (int64_t i = 0; i < block.length; ++i, ++position) { + } else { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (bit_util::GetBit(valid_bits, valid_bits_offset + position)) { ARROW_RETURN_NOT_OK(visit_valid(position)); - } - } else if (block.NoneSet()) { - for (int64_t i = 0; i < block.length; ++i, ++position) { + } else { ARROW_RETURN_NOT_OK(visit_null()); } - } else { - for (int64_t i = 0; i < block.length; ++i, ++position) { - if (bit_util::GetBit(valid_bits, valid_bits_offset + position)) { - ARROW_RETURN_NOT_OK(visit_valid(position)); - } else { - ARROW_RETURN_NOT_OK(visit_null()); - } - } } } - - *out_num_values = values_decoded; - return Status::OK(); } - Status DecodeArrowDenseNonNull(int num_values, - typename EncodingTraits::Accumulator* out, - int* out_num_values) { - constexpr int32_t kBufferSize = 2048; - int32_t indices[kBufferSize]; - int values_decoded = 0; + *out_num_values = values_decoded; + return Status::OK(); + } - ArrowBinaryHelperBase helper(out); - auto dict_values = reinterpret_cast(dictionary_->data()); + Status DecodeArrowDenseNonNull(int num_values, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + constexpr int32_t kBufferSize = 2048; + int32_t indices[kBufferSize]; + int values_decoded = 0; - while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (num_indices == 0) ParquetException::EofException(); - for (int i = 0; i < num_indices; ++i) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); - } - RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); + ArrowBinaryHelperBase helper(out); + auto dict_values = reinterpret_cast(dictionary_->data()); + + while (values_decoded < num_values) { + int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (num_indices == 0) ParquetException::EofException(); + for (int i = 0; i < num_indices; ++i) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { + RETURN_NOT_OK(helper.PushChunk()); } - values_decoded += num_indices; + 
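(The CanFit/PushChunk pair above is what keeps each accumulated BinaryArray below the 32-bit offset ceiling: when the next value would not fit, the current builder is finished into a chunk and a fresh one is started. Roughly, and only as a sketch of the helper's behavior, with chunk_space_remaining_ as its internal byte budget:)

    bool CanFit(int64_t length) const { return length <= chunk_space_remaining_; }

    ::arrow::Status PushChunk() {
      std::shared_ptr<::arrow::Array> chunk;
      RETURN_NOT_OK(builder->Finish(&chunk));   // seal the chunk built so far
      out->chunks.push_back(std::move(chunk));  // hand it to the accumulator
      chunk_space_remaining_ = memory_limit;    // and reset the byte budget
      return ::arrow::Status::OK();
    }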
RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); } - *out_num_values = values_decoded; - return Status::OK(); + values_decoded += num_indices; } + *out_num_values = values_decoded; + return Status::OK(); + } - template - Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* out_num_values) { - constexpr int32_t kBufferSize = 1024; - int32_t indices[kBufferSize]; - - RETURN_NOT_OK(builder->Reserve(num_values)); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); - - auto dict_values = reinterpret_cast(dictionary_->data()); - - int values_decoded = 0; - int num_appended = 0; - while (num_appended < num_values) { - bool is_valid = bit_reader.IsSet(); - bit_reader.Next(); - - if (is_valid) { - int32_t batch_size = - std::min(kBufferSize, num_values - num_appended - null_count); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - - int i = 0; - while (true) { - // Consume all indices - if (is_valid) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - RETURN_NOT_OK(builder->Append(val.ptr, val.len)); - ++i; - ++values_decoded; - } else { - RETURN_NOT_OK(builder->AppendNull()); - --null_count; - } - ++num_appended; - if (i == num_indices) { - // Do not advance the bit_reader if we have fulfilled the decode - // request - break; - } - is_valid = bit_reader.IsSet(); - bit_reader.Next(); + template + Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_num_values) { + constexpr int32_t kBufferSize = 1024; + int32_t indices[kBufferSize]; + + RETURN_NOT_OK(builder->Reserve(num_values)); + ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + + auto dict_values = reinterpret_cast(dictionary_->data()); + + int values_decoded = 0; + int num_appended = 0; + while (num_appended < num_values) { + bool is_valid = bit_reader.IsSet(); + bit_reader.Next(); + + if (is_valid) { + int32_t batch_size = + std::min(kBufferSize, num_values - num_appended - null_count); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + + int i = 0; + while (true) { + // Consume all indices + if (is_valid) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + RETURN_NOT_OK(builder->Append(val.ptr, val.len)); + ++i; + ++values_decoded; + } else { + RETURN_NOT_OK(builder->AppendNull()); + --null_count; } - } else { - RETURN_NOT_OK(builder->AppendNull()); - --null_count; ++num_appended; + if (i == num_indices) { + // Do not advance the bit_reader if we have fulfilled the decode + // request + break; + } + is_valid = bit_reader.IsSet(); + bit_reader.Next(); } + } else { + RETURN_NOT_OK(builder->AppendNull()); + --null_count; + ++num_appended; } - *out_num_values = values_decoded; - return Status::OK(); } + *out_num_values = values_decoded; + return Status::OK(); + } - template - Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) { - constexpr int32_t kBufferSize = 2048; - int32_t indices[kBufferSize]; + template + Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) { + constexpr int32_t kBufferSize = 2048; + int32_t indices[kBufferSize]; - RETURN_NOT_OK(builder->Reserve(num_values)); + RETURN_NOT_OK(builder->Reserve(num_values)); - auto dict_values = reinterpret_cast(dictionary_->data()); + auto 
dict_values = reinterpret_cast(dictionary_->data()); - int values_decoded = 0; - while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (num_indices == 0) ParquetException::EofException(); - for (int i = 0; i < num_indices; ++i) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - RETURN_NOT_OK(builder->Append(val.ptr, val.len)); - } - values_decoded += num_indices; + int values_decoded = 0; + while (values_decoded < num_values) { + int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (num_indices == 0) ParquetException::EofException(); + for (int i = 0; i < num_indices; ++i) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + RETURN_NOT_OK(builder->Append(val.ptr, val.len)); } - *out_num_values = values_decoded; - return Status::OK(); + values_decoded += num_indices; } + *out_num_values = values_decoded; + return Status::OK(); + } }; using DictLargeByteArrayDecoderImpl = DictByteArrayDecoderImpl; @@ -3506,7 +3503,8 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool, bool use_large_binary_variants) { + ::arrow::MemoryPool* pool, + bool use_large_binary_variants) { if (encoding == Encoding::PLAIN) { switch (type_num) { case Type::BOOLEAN: @@ -3575,8 +3573,7 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin namespace detail { std::unique_ptr MakeDictDecoder(Type::type type_num, - const ColumnDescriptor* descr, - MemoryPool* pool, + const ColumnDescriptor* descr, MemoryPool* pool, bool use_large_binary_variants) { switch (type_num) { case Type::BOOLEAN: diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index af5425fdc54b3..f61c5e5b642d2 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -456,7 +456,8 @@ std::unique_ptr::Encoder> MakeTypedEncoder( PARQUET_EXPORT std::unique_ptr MakeDecoder( Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool use_large_binary_variants = false); + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + bool use_large_binary_variants = false); namespace detail { @@ -473,7 +474,8 @@ std::unique_ptr> MakeDictDecoder( const ColumnDescriptor* descr = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { using OutType = DictDecoder; - auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool, std::is_same_v); + auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool, + std::is_same_v); return std::unique_ptr(dynamic_cast(decoder.release())); } @@ -483,7 +485,8 @@ std::unique_ptr::Decoder> MakeTypedDecoder( ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { using OutType = typename EncodingTraits::Decoder; - std::unique_ptr base = MakeDecoder(DType::type_num, encoding, descr, pool, std::is_same_v); + std::unique_ptr base = MakeDecoder(DType::type_num, encoding, descr, pool, + std::is_same_v); return std::unique_ptr(dynamic_cast(base.release())); } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 4e55b50375d7e..e59b2e4e84254 100644 --- 
a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -856,9 +856,7 @@ class PARQUET_EXPORT ArrowReaderProperties { use_large_binary_variants_ = use_large_binary_variants; } - bool use_large_binary_variants() const { - return use_large_binary_variants_; - } + bool use_large_binary_variants() const { return use_large_binary_variants_; } private: bool use_threads_; diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index f24cad9e87bcc..73a25cbf1e131 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -767,8 +767,7 @@ using ByteArrayType = PhysicalType; * will not exceed 2^31 - 1. However, arrow supports StringType/BinaryType and their * large variants (i.e. LargeStringType and LargeBinaryType). * */ -struct LargeByteArrayType : public ByteArrayType -{}; +struct LargeByteArrayType : public ByteArrayType {}; using FLBAType = PhysicalType; From fd8f979bb006e2e47da03d92062031090a892903 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 9 Jun 2023 09:19:30 -0300 Subject: [PATCH 40/69] removed todos --- cpp/src/parquet/page_index.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index f3bca027dac5b..d29cc33eb5afd 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -853,7 +853,6 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, return std::make_unique>(descr, column_index); case Type::BYTE_ARRAY: return std::make_unique>(descr, column_index); - // TODO AP FIX ARTHUR PASSOS case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, column_index); case Type::UNDEFINED: @@ -898,7 +897,6 @@ std::unique_ptr ColumnIndexBuilder::Make( return std::make_unique>(descr); case Type::BYTE_ARRAY: return std::make_unique>(descr); - // TODO AP FIX ARTHUR PASSOS case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr); case Type::UNDEFINED: From a5736d5ca39791db97b44f966fc1ff9661ea2647 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 9 Jun 2023 09:20:33 -0300 Subject: [PATCH 41/69] a bit more renaming --- cpp/src/parquet/column_reader.cc | 4 ++-- cpp/src/parquet/column_reader.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 6da925bc71ba0..0bc7329dbd456 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2307,7 +2307,7 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, LevelInfo leaf_info, MemoryPool* pool, bool read_dictionary, bool read_dense_for_nullable, - bool use_binary_string_large_variants) { + bool use_large_binary_variants) { switch (descr->physical_type()) { case Type::BOOLEAN: return std::make_shared>(descr, leaf_info, pool, @@ -2328,7 +2328,7 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, return std::make_shared>(descr, leaf_info, pool, read_dense_for_nullable); case Type::BYTE_ARRAY: { - return use_binary_string_large_variants + return use_large_binary_variants ? 
MakeLargeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, read_dense_for_nullable) : MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 471117c1f13e6..7e938310a9839 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -322,7 +322,7 @@ class PARQUET_EXPORT RecordReader { const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool read_dictionary = false, bool read_dense_for_nullable = false, - bool use_binary_string_large_variants = false); + bool use_large_binary_variants = false); virtual ~RecordReader() = default; From b4ecd0d601cd3a1ef09fb5e6ccad8b07208665a8 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 9 Jun 2023 09:45:31 -0300 Subject: [PATCH 42/69] address one mor comment --- cpp/src/parquet/arrow/reader.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index e8a3f79aa0e6d..c74a93f419e5c 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1220,6 +1220,7 @@ Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_facto ctx->pool = pool_; ctx->iterator_factory = iterator_factory; ctx->filter_leaves = false; + ctx->use_large_binary_variants = reader_properties_.use_large_binary_variants(); std::unique_ptr result; RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result)); *out = std::move(result); From 9e9dff9b416a6b03c687b161cea5e7711a9382a1 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 9 Jun 2023 13:19:20 -0300 Subject: [PATCH 43/69] add overflow check in dict --- cpp/src/parquet/encoding.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b97950597e5f0..a5b99c256829f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1531,7 +1531,9 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { int total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { - total_size += dict_values[i].len; + if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) { + throw ParquetException("String/Binary Length to large"); + } } PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); From ae1db20cda7c5515d2ab21782838102f162465e0 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 12 Jun 2023 08:47:54 -0300 Subject: [PATCH 44/69] address a few comments --- cpp/src/parquet/encoding.cc | 2 +- cpp/src/parquet/types.h | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index a5b99c256829f..9b0ccf14d255d 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1532,7 +1532,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { int total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) { - throw ParquetException("String/Binary Length to large"); + throw ParquetException("String/Binary length to large"); } } PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 73a25cbf1e131..11eb0e703b7a7 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -763,9 +763,13 @@ using DoubleType = PhysicalType; using 
ByteArrayType = PhysicalType; /* - * Parquet uses ByteArrayType for variable length strings and binaries and their lengths - * will not exceed 2^31 - 1. However, arrow supports StringType/BinaryType and their - * large variants (i.e. LargeStringType and LargeBinaryType). + * Parquet has defined ByteArrayType for variable length string and binary values with a + * maximum length of 2^31 - 1. By default, arrow StringType and BinaryType are used to + * map parquet ByteArrayType. However, arrow StringArray/BinaryArray uses int32_t to + * store the offset of each string/binary value in a concatenated buffer which may + * overflow (though unlikely in most cases). As arrow has defined LargeStringType and + * LargeBinaryType which use int64_t as the offset type, we define LargeByteArrayType + * below to indicate parquet reader/writer to use those large variants from arrow. * */ struct LargeByteArrayType : public ByteArrayType {}; From 09a9eaf3c9d946b3fcfb0ce97947af452396437e Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 14 Jun 2023 09:21:37 -0300 Subject: [PATCH 45/69] use int32_t explicitly --- cpp/src/parquet/encoding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 9b0ccf14d255d..20fffd2fa4182 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1529,7 +1529,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { auto dict_values = reinterpret_cast(dictionary_->mutable_data()); - int total_size = 0; + int32_t total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) { throw ParquetException("String/Binary length to large"); From 1664983da76efa09ed8983ce0a6ab3e34b00f15f Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 14 Jun 2023 14:05:00 -0300 Subject: [PATCH 46/69] use template directly --- cpp/src/parquet/encoding.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 20fffd2fa4182..bf6e00763f699 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -2149,8 +2149,6 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, } }; -using DictLargeByteArrayDecoderImpl = DictByteArrayDecoderImpl; - // ---------------------------------------------------------------------- // DeltaBitPackEncoder @@ -3592,9 +3590,9 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, return std::make_unique>(descr, pool); case Type::BYTE_ARRAY: if (use_large_binary_variants) { - return std::make_unique(descr, pool); + return std::make_unique>(descr, pool); } else { - return std::make_unique>(descr, pool); + return std::make_unique>(descr, pool); } case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, pool); From 322319edea2828410c139c808362f78ea686b04e Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 15 Jun 2023 14:14:15 -0300 Subject: [PATCH 47/69] use offset_type --- cpp/src/parquet/encoding.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index bf6e00763f699..4d8867e4eccf8 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1529,6 +1529,8 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { auto dict_values = reinterpret_cast(dictionary_->mutable_data()); + using offset_type = typename EncodingTraits::ArrowType::offset_type; + int32_t 
total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) { @@ -1538,13 +1540,13 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); PARQUET_THROW_NOT_OK( - byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t), + byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(offset_type), /*shrink_to_fit=*/false)); - int32_t offset = 0; + offset_type offset = 0; uint8_t* bytes_data = byte_array_data_->mutable_data(); - int32_t* bytes_offsets = - reinterpret_cast(byte_array_offsets_->mutable_data()); + auto* bytes_offsets = + reinterpret_cast(byte_array_offsets_->mutable_data()); for (int i = 0; i < dictionary_length_; ++i) { memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); bytes_offsets[i] = offset; From 1775a7a02923aec51bc77cab948da1f598e17324 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 15 Jun 2023 14:50:54 -0300 Subject: [PATCH 48/69] address comments --- cpp/src/parquet/arrow/reader_internal.cc | 5 +++-- cpp/src/parquet/encoding.cc | 6 ++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index a294b712a7ce3..2dbe923a9d43c 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -487,8 +487,9 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, auto chunks = binary_reader->GetBuilderChunks(); for (auto& chunk : chunks) { if (!chunk->type()->Equals(*logical_type_field->type())) { - // XXX: if a LargeBinary chunk is larger than 2GB, the MSBs of offsets - // will be lost because they are first created as int32 and then cast to int64. + // XXX: if a LargeBinary chunk is larger than 2GB and use_large_binary_variants + // is not set, the MSBs of offsets will be lost because they are first created + // as int32 and then cast to int64. 
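(To make the failure mode concrete, a sketch with made-up numbers, not taken from the patch:)

    int32_t offset = 2147483632;   // INT32_MAX - 15, near the 2^31 - 1 ceiling
    // offset += 32;               // would overflow int32: the true position 2147483664
    //                             // is unrepresentable, so the stored offset is garbage
    int64_t widened = offset;      // a later int32 -> int64 cast cannot recover it
    // Hence use_large_binary_variants, which keeps offsets in int64 from the start.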
ARROW_ASSIGN_OR_RAISE( chunk, ::arrow::compute::Cast(*chunk, logical_type_field->type(), cast_options, &ctx)); diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 4d8867e4eccf8..3aaedce7bc51e 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1531,11 +1531,9 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { using offset_type = typename EncodingTraits::ArrowType::offset_type; - int32_t total_size = 0; + offset_type total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { - if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) { - throw ParquetException("String/Binary length to large"); - } + total_size += dict_values[i].len; } PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); From 7f6e2bf58c69cefb9d752cb83d9c8f391ad1869c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 16 Jun 2023 08:12:46 -0300 Subject: [PATCH 49/69] address a few minor comments --- cpp/src/parquet/arrow/reader_internal.cc | 2 +- cpp/src/parquet/column_reader.cc | 5 ++--- cpp/src/parquet/encoding.cc | 4 +++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 2dbe923a9d43c..a1c40df747706 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -487,7 +487,7 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, auto chunks = binary_reader->GetBuilderChunks(); for (auto& chunk : chunks) { if (!chunk->type()->Equals(*logical_type_field->type())) { - // XXX: if a LargeBinary chunk is larger than 2GB and use_large_binary_variants + // If a LargeBinary chunk is larger than 2GB and use_large_binary_variants // is not set, the MSBs of offsets will be lost because they are first created // as int32 and then cast to int64. ARROW_ASSIGN_OR_RAISE( diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 0bc7329dbd456..cf2511db04530 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2093,11 +2093,10 @@ class FLBARecordReader : public TypedRecordReader, std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_; }; -// TODO Below concept could be used to simplify type assertion, -// but it requires c++20 +// TODO: Below concept could be used to simplify type assertion in C++20. 
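(Spelled out, the C++20 alternative the TODO above refers to would look roughly like this sketch; the commented lines below are the patch's own shorthand for the same idea:)

    template <typename T>
    concept ByteArrayTypeConcept =
        std::is_same<T, ByteArrayType>::value || std::is_same<T, LargeByteArrayType>::value;

    // which would allow declarations such as:
    //   template <ByteArrayTypeConcept T> class ByteArrayChunkedRecordReaderImpl;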
// template // concept ByteArrayTypeConcept = std::is_same::value || -// std::is_same::value; +// std::is_same::value; template struct IsByteArrayType : std::false_type {}; diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 3aaedce7bc51e..b81fc68be8d55 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1533,7 +1533,9 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { offset_type total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { - total_size += dict_values[i].len; + if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) { + throw ParquetException("String/Binary length to large"); + } } PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); From 75fb61559fb269ee7816fa4610fe98d88cbcc18e Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 16 Jun 2023 09:54:53 -0300 Subject: [PATCH 50/69] fix DictDecoderImpl --- .../parquet/arrow/arrow_reader_writer_test.cc | 3 ++ cpp/src/parquet/arrow/schema.cc | 3 +- cpp/src/parquet/encoding.cc | 35 ++++++++++++++----- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 7fa6d23414756..b61a0c0affe21 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3873,6 +3873,9 @@ TEST(TestArrowParquet, LargeByteArray) { TryReadDataFile(path, ::arrow::StatusCode::NotImplemented); ArrowReaderProperties reader_properties; reader_properties.set_use_large_binary_variants(true); + reader_properties.set_read_dictionary(0, false); + TryReadDataFileWithProperties(path, reader_properties); + reader_properties.set_read_dictionary(0, true); TryReadDataFileWithProperties(path, reader_properties); } diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 799c9a244ff43..445bc017f5b30 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -462,7 +462,8 @@ struct SchemaTreeContext { bool IsDictionaryReadSupported(const ArrowType& type) { // Only supported currently for BYTE_ARRAY types - return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING; + return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING + || type.id() == ::arrow::Type::LARGE_BINARY || type.id() == ::arrow::Type::LARGE_STRING; } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b81fc68be8d55..827309646b7f3 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1631,11 +1631,19 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { valid_bits, valid_bits_offset, num_values, null_count, [&]() { valid_bytes[i++] = 1; }, [&]() { ++i; }); - auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder); - PARQUET_THROW_NOT_OK( - binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data())); - num_values_ -= num_values - null_count; - return num_values - null_count; + // It looks like this method is only called by ByteArray types. Previously, + // there was an unconditional cast to ::arrow::Dictionary32Builder<::arrow::BinaryType>. + // This won't work for LargeByteArrayType and the Type template argument can't be used + // unconditionally because it is not defined for several other types. 
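(With its template arguments written out, the guarded dispatch that follows reads roughly as below; this is a sketch assuming EncodingTraits<T>::DictAccumulator resolves to ::arrow::BinaryDictionary32Builder for ByteArrayType and to the Large variant for LargeByteArrayType:)

    if constexpr (std::is_same_v<Type, ByteArrayType> ||
                  std::is_same_v<Type, LargeByteArrayType>) {
      auto binary_builder =
          checked_cast<typename EncodingTraits<Type>::DictAccumulator*>(builder);
      PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values,
                                                         valid_bytes.data()));
      num_values_ -= num_values - null_count;
      return num_values - null_count;
    }
    ParquetException::NYI("DecodeIndicesSpaced not implemented for this type");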
+ if constexpr (std::is_same_v || std::is_same_v) { + auto binary_builder = checked_cast::DictAccumulator*>(builder); + PARQUET_THROW_NOT_OK( + binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data())); + num_values_ -= num_values - null_count; + return num_values - null_count; + } + + ParquetException::NYI("DecodeIndicesSpaced not implemented for this type"); } int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) override { @@ -1652,10 +1660,19 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) { ParquetException::EofException(); } - auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder); - PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values)); - num_values_ -= num_values; - return num_values; + + // It looks like this method is only called by ByteArray types. Previously, + // there was an unconditional cast to ::arrow::Dictionary32Builder<::arrow::BinaryType>. + // This won't work for LargeByteArrayType and the Type template argument can't be used + // unconditionally because it is not defined for several other types. + if constexpr (std::is_same_v || std::is_same_v) { + auto binary_builder = checked_cast::DictAccumulator*>(builder); + PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values)); + num_values_ -= num_values; + return num_values; + } + + ParquetException::NYI("DecodeIndices not implemented for this type"); } int DecodeIndices(int num_values, int32_t* indices) override { From 0801267dbea9001e53908ebb40c82a263582c458 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 16 Jun 2023 14:03:33 -0300 Subject: [PATCH 51/69] add non overflow test --- .../parquet/arrow/arrow_reader_writer_test.cc | 36 ++++++++++++++++--- cpp/src/parquet/arrow/reader.cc | 8 ++++- cpp/src/parquet/arrow/reader.h | 6 ++++ 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index b61a0c0affe21..954045b74d1ad 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -439,7 +439,9 @@ void DoSimpleRoundtrip(const std::shared_ptr& table, bool use_threads, int64_t row_group_size, const std::vector& column_subset, std::shared_ptr
* out,
                       const std::shared_ptr<ArrowWriterProperties>& arrow_properties =
-                          default_arrow_writer_properties()) {
+                          default_arrow_writer_properties(),
+                      const ArrowReaderProperties& arrow_reader_properties =
+                          default_arrow_reader_properties()) {
   std::shared_ptr<Buffer> buffer;
   ASSERT_NO_FATAL_FAILURE(
       WriteTableToBuffer(table, row_group_size, arrow_properties, &buffer));
@@ -491,11 +493,14 @@ void DoRoundTripWithBatches(
 void CheckSimpleRoundtrip(
     const std::shared_ptr<Table>
& table, int64_t row_group_size,
     const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
-        default_arrow_writer_properties()) {
+        default_arrow_writer_properties(),
+    const ArrowReaderProperties& arrow_reader_properties =
+        default_arrow_reader_properties()) {
   std::shared_ptr<Table>
result;
   ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, false /* use_threads */,
                                             row_group_size, {}, &result,
-                                            arrow_writer_properties));
+                                            arrow_writer_properties,
+                                            arrow_reader_properties));
   ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
                              /*check_metadata=*/false);
   ASSERT_OK(result->ValidateFull());
@@ -610,9 +615,14 @@ class ParquetIOTestBase : public ::testing::Test {
   }
   void ReaderFromSink(std::unique_ptr<FileReader>* out) {
+    return ReaderFromSink(out, default_arrow_reader_properties());
+  }
+
+  void ReaderFromSink(std::unique_ptr<FileReader>* out,
+                      const ArrowReaderProperties& arrow_reader_properties) {
     ASSERT_OK_AND_ASSIGN(auto buffer, sink_->Finish());
     ASSERT_OK_NO_THROW(OpenFile(std::make_shared<BufferReader>(buffer),
-                                ::arrow::default_memory_pool(), out));
+                                ::arrow::default_memory_pool(), arrow_reader_properties, out));
   }
   void ReadSingleColumnFile(std::unique_ptr<FileReader> file_reader,
@@ -661,16 +671,18 @@ class ParquetIOTestBase : public ::testing::Test {
   void RoundTripSingleColumn(
       const std::shared_ptr<Array>& values, const std::shared_ptr<Array>& expected,
       const std::shared_ptr<::parquet::ArrowWriterProperties>& arrow_properties,
+      const ArrowReaderProperties& arrow_reader_properties = default_arrow_reader_properties(),
       bool nullable = true) {
    std::shared_ptr<Table>
table = MakeSimpleTable(values, nullable);
     this->ResetSink();
+
     ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
                                   values->length(), default_writer_properties(),
                                   arrow_properties));

     std::shared_ptr<Table>
out;
     std::unique_ptr<FileReader> reader;
-    ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader));
+    ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader, arrow_reader_properties));
     const bool expect_metadata = arrow_properties->store_schema();
     ASSERT_NO_FATAL_FAILURE(
         this->ReadTableFromFile(std::move(reader), expect_metadata, &out));
     ASSERT_EQ(1, out->num_columns());
@@ -709,6 +721,12 @@ class ParquetIOTestBase : public ::testing::Test {
     CheckSimpleRoundtrip(table, table->num_rows());
   }
+  void CheckRoundTrip(const std::shared_ptr<Table>
& table, + const std::shared_ptr& arrow_writer_properties, + const ArrowReaderProperties& arrow_reader_properties) { + CheckSimpleRoundtrip(table, table->num_rows(), arrow_writer_properties, arrow_reader_properties); + } + template void WriteColumn(const std::shared_ptr& schema, const std::shared_ptr& values) { @@ -1388,6 +1406,14 @@ TEST_F(TestLargeBinaryParquetIO, Basics) { const auto arrow_properties = ::parquet::ArrowWriterProperties::Builder().store_schema()->build(); this->RoundTripSingleColumn(large_array, large_array, arrow_properties); + + ArrowReaderProperties arrow_reader_properties; + arrow_reader_properties.set_use_large_binary_variants(true); + // Input is narrow array, but expected output is large array, opposite of the above tests. + // This validates narrow arrays can be read as large arrays. + this->RoundTripSingleColumn(narrow_array, large_array, + default_arrow_writer_properties(), + arrow_reader_properties); } using TestLargeStringParquetIO = TestParquetIO<::arrow::LargeStringType>; diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index c74a93f419e5c..87ec4cc9141f6 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1374,9 +1374,15 @@ Result> FileReaderBuilder::Build() { Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, std::unique_ptr* reader) { + return OpenFile(std::move(file), pool, default_arrow_reader_properties(), reader); +} + +Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, + const ArrowReaderProperties& arrow_reader_properties, + std::unique_ptr* reader) { FileReaderBuilder builder; RETURN_NOT_OK(builder.Open(std::move(file))); - return builder.memory_pool(pool)->Build(reader); + return builder.properties(arrow_reader_properties)->memory_pool(pool)->Build(reader); } namespace internal { diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 2cbd36176f5e3..0cfa6eb464927 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -369,6 +369,12 @@ ::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* allocator, std::unique_ptr* reader); +PARQUET_EXPORT +::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>, + ::arrow::MemoryPool* allocator, + const ArrowReaderProperties& arrow_reader_properties, + std::unique_ptr* reader); + /// @} PARQUET_EXPORT From 7f09a160730425d933a433104f09782a50633615 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 19 Jun 2023 13:19:16 -0300 Subject: [PATCH 52/69] string test --- .../parquet/arrow/arrow_reader_writer_test.cc | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 954045b74d1ad..ea709276a2a49 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -1360,6 +1360,31 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) { using TestStringParquetIO = TestParquetIO<::arrow::StringType>; +TEST_F(TestStringParquetIO, Basics) { + std::shared_ptr values; + + ::arrow::StringBuilder builder; + for (size_t i = 0; i < SMALL_SIZE; i++) { + ASSERT_OK(builder.Append("abc")); + } + ASSERT_OK(builder.Finish(&values)); + + // Input is narrow array, but expected output is large array, opposite of the above tests. + // This validates narrow arrays can be read as large arrays. 
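(For reference, the reader-side opt-in these tests exercise looks roughly like the following from user code; the builder chain matches the one used later in this series, while input_file is a placeholder for any arrow::io::RandomAccessFile:)

    parquet::ArrowReaderProperties props;
    props.set_use_large_binary_variants(true);

    parquet::arrow::FileReaderBuilder builder;
    PARQUET_THROW_NOT_OK(builder.Open(input_file));

    std::unique_ptr<parquet::arrow::FileReader> reader;
    PARQUET_THROW_NOT_OK(builder.properties(props)
                             ->memory_pool(::arrow::default_memory_pool())
                             ->Build(&reader));
    // BYTE_ARRAY columns now surface as large_binary()/large_utf8() data.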
+ this->RoundTripSingleColumn(values, values, + default_arrow_writer_properties()); + + ArrowReaderProperties arrow_reader_properties; + arrow_reader_properties.set_use_large_binary_variants(true); + + ASSERT_OK_AND_ASSIGN(std::shared_ptr casted, + ::arrow::compute::Cast(*values, ::arrow::large_utf8())); + + this->RoundTripSingleColumn(values, casted, + default_arrow_writer_properties(), + arrow_reader_properties); +} + TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) { std::shared_ptr values; ::arrow::StringBuilder builder; From a8d20a44552c8d9ca27f16c3ccf3cd4525a9bcdb Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 20 Jun 2023 14:21:52 -0300 Subject: [PATCH 53/69] address minor comments --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index ea709276a2a49..f575aa3908633 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -438,13 +438,13 @@ void CheckConfiguredRoundtrip( void DoSimpleRoundtrip(const std::shared_ptr
& table, bool use_threads,
                       int64_t row_group_size, const std::vector<int>& column_subset,
                       std::shared_ptr<Table>
* out,
-                      const std::shared_ptr<ArrowWriterProperties>& arrow_properties =
+                      const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
                           default_arrow_writer_properties(),
                       const ArrowReaderProperties& arrow_reader_properties =
                           default_arrow_reader_properties()) {
   std::shared_ptr<Buffer> buffer;
   ASSERT_NO_FATAL_FAILURE(
-      WriteTableToBuffer(table, row_group_size, arrow_properties, &buffer));
+      WriteTableToBuffer(table, row_group_size, arrow_writer_properties, &buffer));
   std::unique_ptr<FileReader> reader;
   ASSERT_OK_NO_THROW(OpenFile(std::make_shared<BufferReader>(buffer),
@@ -670,20 +670,19 @@ class ParquetIOTestBase : public ::testing::Test {
   void RoundTripSingleColumn(
       const std::shared_ptr<Array>& values, const std::shared_ptr<Array>& expected,
-      const std::shared_ptr<::parquet::ArrowWriterProperties>& arrow_properties,
+      const std::shared_ptr<::parquet::ArrowWriterProperties>& arrow_writer_properties,
       const ArrowReaderProperties& arrow_reader_properties = default_arrow_reader_properties(),
       bool nullable = true) {
    std::shared_ptr<Table>
table = MakeSimpleTable(values, nullable);
     this->ResetSink();
-    ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_, values->length(), default_writer_properties(),
-                                  arrow_properties));
+    ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
+                                  values->length(), default_writer_properties(),
+                                  arrow_writer_properties));

     std::shared_ptr<Table>
out; std::unique_ptr reader; ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader, arrow_reader_properties)); - const bool expect_metadata = arrow_properties->store_schema(); + const bool expect_metadata = arrow_writer_properties->store_schema(); ASSERT_NO_FATAL_FAILURE( this->ReadTableFromFile(std::move(reader), expect_metadata, &out)); ASSERT_EQ(1, out->num_columns()); @@ -1369,8 +1368,6 @@ TEST_F(TestStringParquetIO, Basics) { } ASSERT_OK(builder.Finish(&values)); - // Input is narrow array, but expected output is large array, opposite of the above tests. - // This validates narrow arrays can be read as large arrays. this->RoundTripSingleColumn(values, values, default_arrow_writer_properties()); From 5fcf4e1a9afe615df92f36627a4b96cdf6c89e89 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 21 Jun 2023 09:01:06 -0300 Subject: [PATCH 54/69] use raw filereaderbuilder instead of adding a new openfile function --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 8 ++++++-- cpp/src/parquet/arrow/reader.cc | 7 +------ cpp/src/parquet/arrow/reader.h | 6 ------ 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index f575aa3908633..ccc5a7cec42aa 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -621,8 +621,12 @@ class ParquetIOTestBase : public ::testing::Test { void ReaderFromSink(std::unique_ptr* out, const ArrowReaderProperties& arrow_reader_properties) { ASSERT_OK_AND_ASSIGN(auto buffer, sink_->Finish()); - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), arrow_reader_properties, out)); + + FileReaderBuilder builder; + + ASSERT_OK_NO_THROW(builder.Open(std::make_shared(buffer))); + + ASSERT_OK_NO_THROW(builder.properties(arrow_reader_properties)->memory_pool(::arrow::default_memory_pool())->Build(out)); } void ReadSingleColumnFile(std::unique_ptr file_reader, diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 87ec4cc9141f6..861e5b8011dae 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1372,17 +1372,12 @@ Result> FileReaderBuilder::Build() { return out; } -Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, - std::unique_ptr* reader) { - return OpenFile(std::move(file), pool, default_arrow_reader_properties(), reader); -} Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, - const ArrowReaderProperties& arrow_reader_properties, std::unique_ptr* reader) { FileReaderBuilder builder; RETURN_NOT_OK(builder.Open(std::move(file))); - return builder.properties(arrow_reader_properties)->memory_pool(pool)->Build(reader); + return builder.memory_pool(pool)->Build(reader); } namespace internal { diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 0cfa6eb464927..2cbd36176f5e3 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -369,12 +369,6 @@ ::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* allocator, std::unique_ptr* reader); -PARQUET_EXPORT -::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>, - ::arrow::MemoryPool* allocator, - const ArrowReaderProperties& arrow_reader_properties, - std::unique_ptr* reader); - /// @} PARQUET_EXPORT From 8901cbcacf39a7b2abdb5a2c76d97643e3dce01f Mon Sep 17 00:00:00 2001 
From: Arthur Passos Date: Wed, 21 Jun 2023 09:04:26 -0300 Subject: [PATCH 55/69] rename test --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index ccc5a7cec42aa..0c703d6c2b44a 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -1363,7 +1363,7 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) { using TestStringParquetIO = TestParquetIO<::arrow::StringType>; -TEST_F(TestStringParquetIO, Basics) { +TEST_F(TestStringParquetIO, NonOverflowStringWithUseLargeBinaryVariantsSetting) { std::shared_ptr values; ::arrow::StringBuilder builder; From dff017a223d7f7ecfd2abd7e4fbe750b3f3a6f7c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 21 Jun 2023 10:01:01 -0300 Subject: [PATCH 56/69] update test file name --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 0c703d6c2b44a..1a44b2b526916 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3920,8 +3920,9 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) { TryReadDataFile(path, ::arrow::StatusCode::IOError); } +#ifdef ARROW_WITH_BROTLI TEST(TestArrowParquet, LargeByteArray) { - auto path = test::get_data_file("chunked_string_map.parquet"); + auto path = test::get_data_file("large_string_map.brotli.parquet"); TryReadDataFile(path, ::arrow::StatusCode::NotImplemented); ArrowReaderProperties reader_properties; reader_properties.set_use_large_binary_variants(true); @@ -3930,6 +3931,7 @@ TEST(TestArrowParquet, LargeByteArray) { reader_properties.set_read_dictionary(0, true); TryReadDataFileWithProperties(path, reader_properties); } +#endif TEST(TestArrowReaderAdHoc, LARGE_MEMORY_TEST(LargeStringColumn)) { // ARROW-3762 From 232e01fdd2e479ed6a2f42fc318eb6a82a0248d5 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 21 Jun 2023 14:18:40 -0300 Subject: [PATCH 57/69] update submodule? --- cpp/submodules/parquet-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index b2e7cc7551591..d79a0101d90df 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit b2e7cc755159196e3a068c8594f7acbaecfdaaac +Subproject commit d79a0101d90dfa3bbb10337626f57a3e8c4b5363 From d7d76c67363eb72edd4a900f14f251be23fa82dd Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 21 Jun 2023 14:46:05 -0300 Subject: [PATCH 58/69] aply clang-format --- .../parquet/arrow/arrow_reader_writer_test.cc | 52 +++++++++---------- cpp/src/parquet/arrow/reader.cc | 1 - cpp/src/parquet/arrow/schema.cc | 5 +- cpp/src/parquet/encoding.cc | 29 +++++++---- 4 files changed, 47 insertions(+), 40 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 1a44b2b526916..fdf5d6a43411f 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -438,8 +438,8 @@ void CheckConfiguredRoundtrip( void DoSimpleRoundtrip(const std::shared_ptr
& table, bool use_threads,
                       int64_t row_group_size, const std::vector<int>& column_subset,
                       std::shared_ptr<Table>
* out,
-                      const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
-                          default_arrow_writer_properties(),
+                      const std::shared_ptr<ArrowWriterProperties>&
+                          arrow_writer_properties = default_arrow_writer_properties(),
                       const ArrowReaderProperties& arrow_reader_properties =
                           default_arrow_reader_properties()) {
   std::shared_ptr<Buffer> buffer;
@@ -490,17 +490,15 @@ void DoRoundTripWithBatches(
-void CheckSimpleRoundtrip(
-    const std::shared_ptr<Table>
& table, int64_t row_group_size,
-    const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
-        default_arrow_writer_properties(),
-    const ArrowReaderProperties& arrow_reader_properties =
-        default_arrow_reader_properties()) {
+void CheckSimpleRoundtrip(const std::shared_ptr<Table>
& table, int64_t row_group_size,
+                          const std::shared_ptr<ArrowWriterProperties>&
+                              arrow_writer_properties = default_arrow_writer_properties(),
+                          const ArrowReaderProperties& arrow_reader_properties =
+                              default_arrow_reader_properties()) {
   std::shared_ptr<Table>
result;
-  ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, false /* use_threads */,
-                                            row_group_size, {}, &result,
-                                            arrow_writer_properties,
-                                            arrow_reader_properties));
+  ASSERT_NO_FATAL_FAILURE(
+      DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {}, &result,
+                        arrow_writer_properties, arrow_reader_properties));
   ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
                              /*check_metadata=*/false);
   ASSERT_OK(result->ValidateFull());
@@ -626,7 +624,9 @@ class ParquetIOTestBase : public ::testing::Test {
     ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer)));
-    ASSERT_OK_NO_THROW(builder.properties(arrow_reader_properties)->memory_pool(::arrow::default_memory_pool())->Build(out));
+    ASSERT_OK_NO_THROW(builder.properties(arrow_reader_properties)
+                           ->memory_pool(::arrow::default_memory_pool())
+                           ->Build(out));
   }
   void ReadSingleColumnFile(std::unique_ptr<FileReader> file_reader,
@@ -675,7 +675,8 @@ class ParquetIOTestBase : public ::testing::Test {
   void RoundTripSingleColumn(
       const std::shared_ptr<Array>& values, const std::shared_ptr<Array>& expected,
       const std::shared_ptr<::parquet::ArrowWriterProperties>& arrow_writer_properties,
-      const ArrowReaderProperties& arrow_reader_properties = default_arrow_reader_properties(),
+      const ArrowReaderProperties& arrow_reader_properties =
+          default_arrow_reader_properties(),
       bool nullable = true) {
    std::shared_ptr<Table>
table = MakeSimpleTable(values, nullable);
     this->ResetSink();
@@ -724,10 +725,12 @@ class ParquetIOTestBase : public ::testing::Test {
     CheckSimpleRoundtrip(table, table->num_rows());
   }
-  void CheckRoundTrip(const std::shared_ptr<Table>
& table,
-                      const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties,
-                      const ArrowReaderProperties& arrow_reader_properties) {
-    CheckSimpleRoundtrip(table, table->num_rows(), arrow_writer_properties, arrow_reader_properties);
+  void CheckRoundTrip(
+      const std::shared_ptr<Table>
& table, + const std::shared_ptr& arrow_writer_properties, + const ArrowReaderProperties& arrow_reader_properties) { + CheckSimpleRoundtrip(table, table->num_rows(), arrow_writer_properties, + arrow_reader_properties); } template @@ -1372,8 +1375,7 @@ TEST_F(TestStringParquetIO, NonOverflowStringWithUseLargeBinaryVariantsSetting) } ASSERT_OK(builder.Finish(&values)); - this->RoundTripSingleColumn(values, values, - default_arrow_writer_properties()); + this->RoundTripSingleColumn(values, values, default_arrow_writer_properties()); ArrowReaderProperties arrow_reader_properties; arrow_reader_properties.set_use_large_binary_variants(true); @@ -1381,8 +1383,7 @@ TEST_F(TestStringParquetIO, NonOverflowStringWithUseLargeBinaryVariantsSetting) ASSERT_OK_AND_ASSIGN(std::shared_ptr casted, ::arrow::compute::Cast(*values, ::arrow::large_utf8())); - this->RoundTripSingleColumn(values, casted, - default_arrow_writer_properties(), + this->RoundTripSingleColumn(values, casted, default_arrow_writer_properties(), arrow_reader_properties); } @@ -1435,11 +1436,10 @@ TEST_F(TestLargeBinaryParquetIO, Basics) { ArrowReaderProperties arrow_reader_properties; arrow_reader_properties.set_use_large_binary_variants(true); - // Input is narrow array, but expected output is large array, opposite of the above tests. - // This validates narrow arrays can be read as large arrays. + // Input is narrow array, but expected output is large array, opposite of the above + // tests. This validates narrow arrays can be read as large arrays. this->RoundTripSingleColumn(narrow_array, large_array, - default_arrow_writer_properties(), - arrow_reader_properties); + default_arrow_writer_properties(), arrow_reader_properties); } using TestLargeStringParquetIO = TestParquetIO<::arrow::LargeStringType>; diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 861e5b8011dae..c74a93f419e5c 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1372,7 +1372,6 @@ Result> FileReaderBuilder::Build() { return out; } - Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, std::unique_ptr* reader) { FileReaderBuilder builder; diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 445bc017f5b30..b58ebedb62737 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -462,8 +462,9 @@ struct SchemaTreeContext { bool IsDictionaryReadSupported(const ArrowType& type) { // Only supported currently for BYTE_ARRAY types - return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING - || type.id() == ::arrow::Type::LARGE_BINARY || type.id() == ::arrow::Type::LARGE_STRING; + return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING || + type.id() == ::arrow::Type::LARGE_BINARY || + type.id() == ::arrow::Type::LARGE_STRING; } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 827309646b7f3..7debd21dd39be 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1632,11 +1632,14 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { [&]() { valid_bytes[i++] = 1; }, [&]() { ++i; }); // It looks like this method is only called by ByteArray types. Previously, - // there was an unconditional cast to ::arrow::Dictionary32Builder<::arrow::BinaryType>. 
- // This won't work for LargeByteArrayType and the Type template argument can't be used - // unconditionally because it is not defined for several other types. - if constexpr (std::is_same_v || std::is_same_v) { - auto binary_builder = checked_cast::DictAccumulator*>(builder); + // there was an unconditional cast to + // ::arrow::Dictionary32Builder<::arrow::BinaryType>. This won't work for + // LargeByteArrayType and the Type template argument can't be used unconditionally + // because it is not defined for several other types. + if constexpr (std::is_same_v || + std::is_same_v) { + auto binary_builder = + checked_cast::DictAccumulator*>(builder); PARQUET_THROW_NOT_OK( binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data())); num_values_ -= num_values - null_count; @@ -1662,11 +1665,14 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { } // It looks like this method is only called by ByteArray types. Previously, - // there was an unconditional cast to ::arrow::Dictionary32Builder<::arrow::BinaryType>. - // This won't work for LargeByteArrayType and the Type template argument can't be used - // unconditionally because it is not defined for several other types. - if constexpr (std::is_same_v || std::is_same_v) { - auto binary_builder = checked_cast::DictAccumulator*>(builder); + // there was an unconditional cast to + // ::arrow::Dictionary32Builder<::arrow::BinaryType>. This won't work for + // LargeByteArrayType and the Type template argument can't be used unconditionally + // because it is not defined for several other types. + if constexpr (std::is_same_v || + std::is_same_v) { + auto binary_builder = + checked_cast::DictAccumulator*>(builder); PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values)); num_values_ -= num_values; return num_values; @@ -3609,7 +3615,8 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, return std::make_unique>(descr, pool); case Type::BYTE_ARRAY: if (use_large_binary_variants) { - return std::make_unique>(descr, pool); + return std::make_unique>(descr, + pool); } else { return std::make_unique>(descr, pool); } From 90ceb0740c0f5dc1305e60861639738d6ff976d9 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 22 Jun 2023 11:29:10 -0300 Subject: [PATCH 59/69] address minor comments --- .../parquet/arrow/arrow_reader_writer_test.cc | 28 ++++--------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index fdf5d6a43411f..4b6b2e3f183e5 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -439,9 +439,7 @@ void DoSimpleRoundtrip(const std::shared_ptr
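A note on the DecodeArrow change above: the decoder is one class template, but only its byte-array instantiations may touch the dictionary builder, so the cast is guarded with if constexpr and discarded at compile time for every other type. The following self-contained sketch shows the same idiom; the type names are illustrative only, not Arrow's.

    #include <cstdint>
    #include <iostream>
    #include <type_traits>

    struct SmallBinary { using offset_type = int32_t; };
    struct LargeBinary { using offset_type = int64_t; };
    struct PlainInt32 {};

    template <typename Type>
    void DecodeBatch(int num_values) {
      if constexpr (std::is_same_v<Type, SmallBinary> ||
                    std::is_same_v<Type, LargeBinary>) {
        // Only instantiated for the two binary variants, so code that would
        // not compile for PlainInt32 (offset arithmetic) is legal here.
        typename Type::offset_type offset = 0;
        std::cout << "binary path, " << 8 * sizeof(offset) << "-bit offsets, "
                  << num_values << " values\n";
      } else {
        std::cout << "generic path, " << num_values << " values\n";
      }
    }

    int main() {
      DecodeBatch<SmallBinary>(10);
      DecodeBatch<LargeBinary>(10);
      DecodeBatch<PlainInt32>(10);
      return 0;
    }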
From 90ceb0740c0f5dc1305e60861639738d6ff976d9 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Thu, 22 Jun 2023 11:29:10 -0300
Subject: [PATCH 59/69] address minor comments

---
 .../parquet/arrow/arrow_reader_writer_test.cc | 28 ++++----------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index fdf5d6a43411f..4b6b2e3f183e5 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -439,9 +439,7 @@ void DoSimpleRoundtrip(const std::shared_ptr<Table>& table, bool use_threads,
                        int64_t row_group_size, const std::vector<int>& column_subset,
                        std::shared_ptr<Table>* out,
                        const std::shared_ptr<ArrowWriterProperties>&
-                           arrow_writer_properties = default_arrow_writer_properties(),
-                       const ArrowReaderProperties& arrow_reader_properties =
-                           default_arrow_reader_properties()) {
+                           arrow_writer_properties = default_arrow_writer_properties()) {
   std::shared_ptr<Buffer> buffer;
   ASSERT_NO_FATAL_FAILURE(
       WriteTableToBuffer(table, row_group_size, arrow_writer_properties, &buffer));
@@ -492,13 +490,11 @@ void DoRoundTripWithBatches(
 void CheckSimpleRoundtrip(const std::shared_ptr<Table>& table, int64_t row_group_size,
                           const std::shared_ptr<ArrowWriterProperties>&
-                              arrow_writer_properties = default_arrow_writer_properties(),
-                          const ArrowReaderProperties& arrow_reader_properties =
-                              default_arrow_reader_properties()) {
+                              arrow_writer_properties = default_arrow_writer_properties()) {
   std::shared_ptr<Table> result;
   ASSERT_NO_FATAL_FAILURE(
       DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {}, &result,
-                        arrow_writer_properties, arrow_reader_properties));
+                        arrow_writer_properties));
   ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
                              /*check_metadata=*/false);
   ASSERT_OK(result->ValidateFull());
@@ -725,14 +721,6 @@ class ParquetIOTestBase : public ::testing::Test {
     CheckSimpleRoundtrip(table, table->num_rows());
   }
 
-  void CheckRoundTrip(
-      const std::shared_ptr<Table>& table,
-      const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties,
-      const ArrowReaderProperties& arrow_reader_properties) {
-    CheckSimpleRoundtrip(table, table->num_rows(), arrow_writer_properties,
-                         arrow_reader_properties);
-  }
-
   template
   void WriteColumn(const std::shared_ptr<GroupNode>& schema,
                    const std::shared_ptr<Array>& values) {
@@ -1366,14 +1354,8 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) {
 
 using TestStringParquetIO = TestParquetIO<::arrow::StringType>;
 
-TEST_F(TestStringParquetIO, NonOverflowStringWithUseLargeBinaryVariantsSetting) {
-  std::shared_ptr<Array> values;
-
-  ::arrow::StringBuilder builder;
-  for (size_t i = 0; i < SMALL_SIZE; i++) {
-    ASSERT_OK(builder.Append("abc"));
-  }
-  ASSERT_OK(builder.Finish(&values));
+TEST_F(TestStringParquetIO, SmallStringWithLargeBinaryVariantSetting) {
+  auto values = ArrayFromJSON(::arrow::utf8(), R"(["foo", "", null, "bar"])");
 
   this->RoundTripSingleColumn(values, values, default_arrow_writer_properties());
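For orientation, this is what the round-trip tests are exercising from the caller's side. The sketch below assumes only APIs that appear in this patch series (FileReaderBuilder, ArrowReaderProperties::set_use_large_binary_variants) plus standard Arrow file I/O; it is an illustration, not part of the patch.

    #include <arrow/api.h>
    #include <arrow/io/file.h>
    #include <parquet/arrow/reader.h>
    #include <parquet/properties.h>

    // Open a Parquet file and decode BYTE_ARRAY columns into the large
    // (64-bit offset) Arrow variants, e.g. utf8 columns come back large_utf8.
    arrow::Status ReadAsLargeVariants(const std::string& path,
                                      std::shared_ptr<arrow::Table>* out) {
      ARROW_ASSIGN_OR_RAISE(auto infile, arrow::io::ReadableFile::Open(path));

      parquet::ArrowReaderProperties arrow_reader_properties;
      arrow_reader_properties.set_use_large_binary_variants(true);

      parquet::arrow::FileReaderBuilder builder;
      ARROW_RETURN_NOT_OK(builder.Open(infile));
      std::unique_ptr<parquet::arrow::FileReader> reader;
      ARROW_RETURN_NOT_OK(builder.properties(arrow_reader_properties)
                              ->memory_pool(arrow::default_memory_pool())
                              ->Build(&reader));
      return reader->ReadTable(out);
    }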
From 03949636fc1e721fa2ed5dac75621f89efd1af1d Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Thu, 22 Jun 2023 13:47:39 -0300
Subject: [PATCH 60/69] delta & delta length for large*

---
 cpp/src/parquet/encoding.cc | 48 ++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 7debd21dd39be..72389c08e2e9c 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -2821,10 +2821,11 @@ std::shared_ptr<Buffer> DeltaLengthByteArrayEncoder<DType>::FlushValues() {
 // ----------------------------------------------------------------------
 // DeltaLengthByteArrayDecoder
 
-class DeltaLengthByteArrayDecoder : public DecoderImpl,
-                                    virtual public TypedDecoder<ByteArrayType> {
+template <typename Type>
+class DeltaLengthByteArrayDecoderBase : public DecoderImpl,
+                                    virtual public TypedDecoder<Type> {
  public:
-  explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr,
+  explicit DeltaLengthByteArrayDecoderBase(const ColumnDescriptor* descr,
                                        MemoryPool* pool = ::arrow::default_memory_pool())
       : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY),
         len_decoder_(nullptr, pool),
         buffered_length_(AllocateBuffer(pool, 0)) {}
@@ -2875,7 +2876,7 @@
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
-                  typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+                  typename EncodingTraits<Type>::Accumulator* out) override {
     int result = 0;
     PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
                                           valid_bits_offset, out, &result));
@@ -2884,7 +2885,7 @@
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
-                  typename EncodingTraits<ByteArrayType>::DictAccumulator* out) override {
+                  typename EncodingTraits<Type>::DictAccumulator* out) override {
     ParquetException::NYI(
         "DecodeArrow of DictAccumulator for DeltaLengthByteArrayDecoder");
   }
@@ -2910,9 +2911,9 @@
   Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
                           int64_t valid_bits_offset,
-                          typename EncodingTraits<ByteArrayType>::Accumulator* out,
+                          typename EncodingTraits<Type>::Accumulator* out,
                           int* out_num_values) {
-    ArrowBinaryHelper helper(out);
+    ArrowBinaryHelperBase<Type> helper(out);
 
     std::vector values(num_values - null_count);
     const int num_valid_values = Decode(values.data(), num_values - null_count);
@@ -2953,6 +2954,9 @@
   std::shared_ptr<ResizableBuffer> buffered_length_;
 };
 
+using DeltaLengthByteArrayDecoder = DeltaLengthByteArrayDecoderBase<ByteArrayType>;
+using DeltaLengthLargeByteArrayDecoder = DeltaLengthByteArrayDecoderBase<LargeByteArrayType>;
+
 // ----------------------------------------------------------------------
 // RLE_BOOLEAN_ENCODER
@@ -3143,10 +3147,11 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
 
 // ----------------------------------------------------------------------
 // DELTA_BYTE_ARRAY
 
-class DeltaByteArrayDecoder : public DecoderImpl,
-                              virtual public TypedDecoder<ByteArrayType> {
+template <typename Type>
+class DeltaByteArrayDecoderBase : public DecoderImpl,
+                              virtual public TypedDecoder<Type> {
  public:
-  explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr,
+  explicit DeltaByteArrayDecoderBase(const ColumnDescriptor* descr,
                                  MemoryPool* pool = ::arrow::default_memory_pool())
       : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY),
         prefix_len_decoder_(nullptr, pool),
@@ -3189,7 +3194,7 @@
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
-                  typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+                  typename EncodingTraits<Type>::Accumulator* out) override {
     int result = 0;
     PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
                                           valid_bits_offset, out, &result));
@@ -3199,7 +3204,7 @@
   int DecodeArrow(
       int num_values, int null_count, const uint8_t* valid_bits,
       int64_t valid_bits_offset,
-      typename EncodingTraits<ByteArrayType>::DictAccumulator* builder) override {
+      typename EncodingTraits<Type>::DictAccumulator* builder) override {
     ParquetException::NYI("DecodeArrow of DictAccumulator for DeltaByteArrayDecoder");
   }
@@ -3261,9 +3266,9 @@
   Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
                           int64_t valid_bits_offset,
-                          typename EncodingTraits<ByteArrayType>::Accumulator* out,
+                          typename EncodingTraits<Type>::Accumulator* out,
                           int* out_num_values) {
-    ArrowBinaryHelper helper(out);
+    ArrowBinaryHelperBase<Type> helper(out);
     std::vector values(num_values);
     const int num_valid_values = GetInternal(values.data(), num_values - null_count);
@@ -3306,6 +3311,9 @@
   std::shared_ptr<ResizableBuffer> buffered_data_;
 };
 
+using DeltaByteArrayDecoder = DeltaByteArrayDecoderBase<ByteArrayType>;
+using DeltaLargeByteArrayDecoder = DeltaByteArrayDecoderBase<LargeByteArrayType>;
+
 // ----------------------------------------------------------------------
 // BYTE_STREAM_SPLIT
@@ -3576,12 +3584,20 @@ std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encodin
     }
   } else if (encoding == Encoding::DELTA_BYTE_ARRAY) {
     if (type_num == Type::BYTE_ARRAY) {
-      return std::make_unique<DeltaByteArrayDecoder>(descr, pool);
+      if (use_large_binary_variants) {
+        return std::make_unique<DeltaByteArrayDecoder>(descr);
+      } else {
+        return std::make_unique<DeltaLargeByteArrayDecoder>(descr);
+      }
    }
    throw ParquetException("DELTA_BYTE_ARRAY only supports BYTE_ARRAY");
  } else if (encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY) {
    if (type_num == Type::BYTE_ARRAY) {
-      return std::make_unique<DeltaLengthByteArrayDecoder>(descr, pool);
+      if (use_large_binary_variants) {
+        return std::make_unique<DeltaLengthByteArrayDecoder>(descr, pool);
+      } else {
+        return std::make_unique<DeltaLengthLargeByteArrayDecoder>(descr, pool);
+      }
    }
    throw ParquetException("DELTA_LENGTH_BYTE_ARRAY only supports BYTE_ARRAY");
  } else if (encoding == Encoding::RLE) {
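The shape of this refactor is worth calling out: the concrete DeltaLengthByteArrayDecoder and DeltaByteArrayDecoder classes become templated *Base classes, and the old names are kept as aliases so existing call sites compile unchanged while the Large* instantiations are added beside them. Schematically (illustrative names, not the real decoders):

    #include <cstdint>
    #include <vector>

    struct ByteArrayRef { uint32_t len; const uint8_t* ptr; };
    struct LargeByteArrayRef { uint64_t len; const uint8_t* ptr; };

    // One implementation, templated on the element type.
    template <typename Elem>
    class VarLenDecoderBase {
     public:
      // Stub decode: real code would fill lengths/pointers from a data page.
      void Decode(std::vector<Elem>* out, int n) const { out->assign(n, Elem{}); }
    };

    // Old name preserved as an alias; callers are untouched.
    using VarLenDecoder = VarLenDecoderBase<ByteArrayRef>;
    // New 64-bit-length variant reuses the same implementation.
    using LargeVarLenDecoder = VarLenDecoderBase<LargeByteArrayRef>;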
From a8df2e7a15d1851bcb33418f0f6719b9f1d0ad1e Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Thu, 22 Jun 2023 14:03:32 -0300
Subject: [PATCH 61/69] fix wrong if statements

---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 4 +++-
 cpp/src/parquet/encoding.cc                       | 8 ++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 4b6b2e3f183e5..531d854369433 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -1354,6 +1354,7 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) {
 
 using TestStringParquetIO = TestParquetIO<::arrow::StringType>;
 
+#if defined(_WIN64) || defined(__x86_64__)
 TEST_F(TestStringParquetIO, SmallStringWithLargeBinaryVariantSetting) {
   auto values = ArrayFromJSON(::arrow::utf8(), R"(["foo", "", null, "bar"])");
 
@@ -1368,6 +1369,7 @@ TEST_F(TestStringParquetIO, SmallStringWithLargeBinaryVariantSetting) {
   this->RoundTripSingleColumn(values, casted, default_arrow_writer_properties(),
                               arrow_reader_properties);
 }
+#endif
 
 TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {
   std::shared_ptr<Array> values;
@@ -3902,7 +3904,7 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }
 
-#ifdef ARROW_WITH_BROTLI
+#if defined(ARROW_WITH_BROTLI) && (defined(_WIN64) || defined(__x86_64__))
 TEST(TestArrowParquet, LargeByteArray) {
   auto path = test::get_data_file("large_string_map.brotli.parquet");
   TryReadDataFile(path, ::arrow::StatusCode::NotImplemented);
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 72389c08e2e9c..0031f48a62dd6 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -3585,18 +3585,18 @@ std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encodin
   } else if (encoding == Encoding::DELTA_BYTE_ARRAY) {
     if (type_num == Type::BYTE_ARRAY) {
       if (use_large_binary_variants) {
-        return std::make_unique<DeltaByteArrayDecoder>(descr);
-      } else {
         return std::make_unique<DeltaLargeByteArrayDecoder>(descr);
+      } else {
+        return std::make_unique<DeltaByteArrayDecoder>(descr);
       }
     }
     throw ParquetException("DELTA_BYTE_ARRAY only supports BYTE_ARRAY");
   } else if (encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY) {
     if (type_num == Type::BYTE_ARRAY) {
       if (use_large_binary_variants) {
-        return std::make_unique<DeltaLengthByteArrayDecoder>(descr, pool);
-      } else {
         return std::make_unique<DeltaLengthLargeByteArrayDecoder>(descr, pool);
+      } else {
+        return std::make_unique<DeltaLengthByteArrayDecoder>(descr, pool);
       }
     }
     throw ParquetException("DELTA_LENGTH_BYTE_ARRAY only supports BYTE_ARRAY");

From 2bb3b14376949c77adf85ec9f0149d23c4db7cfc Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Fri, 23 Jun 2023 09:10:06 -0300
Subject: [PATCH 62/69] Template member variable as well

---
 cpp/src/parquet/encoding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 0031f48a62dd6..4a8e96948b093 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -3301,7 +3301,7 @@ class DeltaByteArrayDecoderBase : public DecoderImpl,
   std::shared_ptr<::arrow::bit_util::BitReader> decoder_;
   DeltaBitPackDecoder<Int32Type> prefix_len_decoder_;
-  DeltaLengthByteArrayDecoder suffix_decoder_;
+  DeltaLengthByteArrayDecoderBase<Type> suffix_decoder_;
   std::string last_value_;
   // string buffer for last value in previous page
   std::string last_value_in_previous_page_;
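On the platform guards introduced above: the large-binary tests need a 64-bit address space, so they are compiled only on 64-bit targets. _WIN64 covers 64-bit MSVC; __x86_64__ covers x86-64 GCC/Clang but misses other 64-bit architectures such as AArch64, which is presumably why the final commit in this series switches to __LP64__ (defined on LP64 ABIs generally, never by MSVC, hence _WIN64 stays as the alternative). A sketch of the guard with a compile-time cross-check; the constant name is illustrative:

    #include <cstdint>

    #if defined(_WIN64) || defined(__LP64__)
    // Both macros imply 8-byte pointers, i.e. enough address space for
    // large binary offsets.
    static_assert(sizeof(void*) == 8, "expected a 64-bit target");
    inline constexpr bool kLargeBinaryTestsEnabled = true;
    #else
    inline constexpr bool kLargeBinaryTestsEnabled = false;
    #endif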
From c114d441da3d715a24831aa56f28aadecb3552f1 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Fri, 23 Jun 2023 09:13:29 -0300
Subject: [PATCH 63/69] add docstring

---
 cpp/src/parquet/properties.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index e59b2e4e84254..2b027ff6ab38f 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -852,10 +852,12 @@ class PARQUET_EXPORT ArrowReaderProperties {
     return coerce_int96_timestamp_unit_;
   }
 
+  /// Set whether to use large binary variants for binary data
+  /// (default is false).
   void set_use_large_binary_variants(bool use_large_binary_variants) {
     use_large_binary_variants_ = use_large_binary_variants;
   }
-
+  /// Return whether large binary variants are enabled.
   bool use_large_binary_variants() const { return use_large_binary_variants_; }
 
  private:

From d1d57989941bcf29f120671a76d9bdf141a4151f Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Fri, 23 Jun 2023 09:16:16 -0300
Subject: [PATCH 64/69] add LargeStringDictionary32Builder

---
 cpp/src/arrow/array/builder_dict.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h
index 3adf5b843b916..f46eaefc74b6f 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -725,6 +725,7 @@ using StringDictionaryBuilder = DictionaryBuilder<StringType>;
 using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
 using StringDictionary32Builder = Dictionary32Builder<StringType>;
 using LargeBinaryDictionary32Builder = Dictionary32Builder<LargeBinaryType>;
+using LargeStringDictionary32Builder = Dictionary32Builder<LargeStringType>;
 
 /// @}
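The new alias gives 32-bit dictionary indices over large_utf8 values. A usage sketch, assuming the standard DictionaryBuilder API (Append/Finish) behaves for LargeStringType as it does for StringType:

    #include <arrow/api.h>
    #include <iostream>

    arrow::Status BuildLargeStringDict() {
      // From builder_dict.h: Dictionary32Builder<LargeStringType>.
      arrow::LargeStringDictionary32Builder builder;
      ARROW_RETURN_NOT_OK(builder.Append("foo"));
      ARROW_RETURN_NOT_OK(builder.Append("bar"));
      ARROW_RETURN_NOT_OK(builder.Append("foo"));  // dedups against entry 0

      std::shared_ptr<arrow::Array> array;
      ARROW_RETURN_NOT_OK(builder.Finish(&array));
      // Expected: dictionary<values=large_string, indices=int32>
      std::cout << array->type()->ToString() << std::endl;
      return arrow::Status::OK();
    }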
From 0eaa60fae50d0f27000270bb017b60a47b8a87a8 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 26 Jun 2023 10:41:31 -0300
Subject: [PATCH 65/69] address a few comments

---
 .../parquet/arrow/arrow_reader_writer_test.cc | 31 +++++++++++++++++--
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 531d854369433..072d8c6935379 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -617,9 +617,7 @@ class ParquetIOTestBase : public ::testing::Test {
     ASSERT_OK_AND_ASSIGN(auto buffer, sink_->Finish());
 
     FileReaderBuilder builder;
-
     ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer)));
-
     ASSERT_OK_NO_THROW(builder.properties(arrow_reader_properties)
                            ->memory_pool(::arrow::default_memory_pool())
                            ->Build(out));
@@ -4603,16 +4601,22 @@ TEST(TestArrowWriteDictionaries, NestedSubfield) {
 class TestArrowReadDeltaEncoding : public ::testing::Test {
  public:
   void ReadTableFromParquetFile(const std::string& file_name,
+                                const ArrowReaderProperties& properties,
                                 std::shared_ptr<Table>* out) {
     auto file = test::get_data_file(file_name);
     auto pool = ::arrow::default_memory_pool();
     std::unique_ptr<FileReader> parquet_reader;
     ASSERT_OK(FileReader::Make(pool, ParquetFileReader::OpenFile(file, false),
-                               &parquet_reader));
+                               properties, &parquet_reader));
     ASSERT_OK(parquet_reader->ReadTable(out));
     ASSERT_OK((*out)->ValidateFull());
   }
 
+  void ReadTableFromParquetFile(const std::string& file_name,
+                                std::shared_ptr<Table>* out) {
+    return ReadTableFromParquetFile(file_name, default_arrow_reader_properties(), out);
+  }
+
   void ReadTableFromCSVFile(const std::string& file_name,
                             const ::arrow::csv::ConvertOptions& convert_options,
                             std::shared_ptr<Table>* out) {
@@ -4660,6 +4664,27 @@ TEST_F(TestArrowReadDeltaEncoding, DeltaByteArray) {
   ::arrow::AssertTablesEqual(*actual_table, *expect_table, false);
 }
 
+TEST_F(TestArrowReadDeltaEncoding, DeltaByteArrayWithLargeBinaryVariant) {
+  std::shared_ptr<::arrow::Table> actual_table, expect_table;
+  ArrowReaderProperties properties;
+  properties.set_use_large_binary_variants(true);
+
+  ReadTableFromParquetFile("delta_byte_array.parquet", properties, &actual_table);
+
+  auto convert_options = ::arrow::csv::ConvertOptions::Defaults();
+  std::vector<std::string> column_names = {
+      "c_customer_id", "c_salutation",          "c_first_name",
+      "c_last_name",   "c_preferred_cust_flag", "c_birth_country",
+      "c_login",       "c_email_address",       "c_last_review_date"};
+  for (auto name : column_names) {
+    convert_options.column_types[name] = ::arrow::large_utf8();
+  }
+  convert_options.strings_can_be_null = true;
+  ReadTableFromCSVFile("delta_byte_array_expect.csv", convert_options, &expect_table);
+
+  ::arrow::AssertTablesEqual(*actual_table, *expect_table, false);
+}
+
 TEST_F(TestArrowReadDeltaEncoding, IncrementalDecodeDeltaByteArray) {
   auto file = test::get_data_file("delta_byte_array.parquet");
   auto pool = ::arrow::default_memory_pool();
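The expected tables in these delta-encoding tests come from CSV fixtures whose columns are coerced to the large types, so AssertTablesEqual compares like with like. A standalone sketch of that loading pattern with the Arrow CSV reader (paths and column names are illustrative):

    #include <arrow/api.h>
    #include <arrow/csv/api.h>
    #include <arrow/io/file.h>

    arrow::Result<std::shared_ptr<arrow::Table>> ReadExpectation(
        const std::string& path) {
      ARROW_ASSIGN_OR_RAISE(auto input, arrow::io::ReadableFile::Open(path));

      auto read_options = arrow::csv::ReadOptions::Defaults();
      auto parse_options = arrow::csv::ParseOptions::Defaults();
      auto convert_options = arrow::csv::ConvertOptions::Defaults();
      // Force string-like columns to 64-bit offsets to match the reader output.
      convert_options.column_types["c_customer_id"] = arrow::large_utf8();
      convert_options.strings_can_be_null = true;

      ARROW_ASSIGN_OR_RAISE(auto reader,
                            arrow::csv::TableReader::Make(
                                arrow::io::default_io_context(), input, read_options,
                                parse_options, convert_options));
      return reader->Read();
    }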
From 1e642fae8ec8f307fd327fd6e933558b05422278 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 26 Jun 2023 10:42:34 -0300
Subject: [PATCH 66/69] clang format

---
 .../parquet/arrow/arrow_reader_writer_test.cc | 17 ++++++++-------
 cpp/src/parquet/encoding.cc                   | 21 +++++++++----------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 072d8c6935379..e1aa14e9a9442 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -488,13 +488,14 @@ void DoRoundTripWithBatches(
   ASSERT_OK_AND_ASSIGN(*out, Table::FromRecordBatchReader(batch_reader.get()));
 }
 
-void CheckSimpleRoundtrip(const std::shared_ptr<Table>& table, int64_t row_group_size,
-                          const std::shared_ptr<ArrowWriterProperties>&
-                              arrow_writer_properties = default_arrow_writer_properties()) {
+void CheckSimpleRoundtrip(
+    const std::shared_ptr<Table>& table, int64_t row_group_size,
+    const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
+        default_arrow_writer_properties()) {
   std::shared_ptr<Table> result;
-  ASSERT_NO_FATAL_FAILURE(
-      DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {}, &result,
-                        arrow_writer_properties));
+  ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, false /* use_threads */,
+                                            row_group_size, {}, &result,
+                                            arrow_writer_properties));
   ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
                              /*check_metadata=*/false);
   ASSERT_OK(result->ValidateFull());
@@ -4606,8 +4607,8 @@ class TestArrowReadDeltaEncoding : public ::testing::Test {
     auto file = test::get_data_file(file_name);
     auto pool = ::arrow::default_memory_pool();
     std::unique_ptr<FileReader> parquet_reader;
-    ASSERT_OK(FileReader::Make(pool, ParquetFileReader::OpenFile(file, false),
-                               properties, &parquet_reader));
+    ASSERT_OK(FileReader::Make(pool, ParquetFileReader::OpenFile(file, false), properties,
+                               &parquet_reader));
     ASSERT_OK(parquet_reader->ReadTable(out));
     ASSERT_OK((*out)->ValidateFull());
   }
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 4a8e96948b093..3d6bb01bf9752 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -2823,10 +2823,10 @@ std::shared_ptr<Buffer> DeltaLengthByteArrayEncoder<DType>::FlushValues() {
 template <typename Type>
 class DeltaLengthByteArrayDecoderBase : public DecoderImpl,
-                                    virtual public TypedDecoder<Type> {
+                                        virtual public TypedDecoder<Type> {
  public:
-  explicit DeltaLengthByteArrayDecoderBase(const ColumnDescriptor* descr,
-                                       MemoryPool* pool = ::arrow::default_memory_pool())
+  explicit DeltaLengthByteArrayDecoderBase(
+      const ColumnDescriptor* descr, MemoryPool* pool = ::arrow::default_memory_pool())
       : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY),
         len_decoder_(nullptr, pool),
         buffered_length_(AllocateBuffer(pool, 0)) {}
@@ -2955,7 +2955,8 @@
 using DeltaLengthByteArrayDecoder = DeltaLengthByteArrayDecoderBase<ByteArrayType>;
-using DeltaLengthLargeByteArrayDecoder = DeltaLengthByteArrayDecoderBase<LargeByteArrayType>;
+using DeltaLengthLargeByteArrayDecoder =
+    DeltaLengthByteArrayDecoderBase<LargeByteArrayType>;
 
 // ----------------------------------------------------------------------
 // RLE_BOOLEAN_ENCODER
@@ -3148,11 +3149,10 @@
 // ----------------------------------------------------------------------
 // DELTA_BYTE_ARRAY
 
 template <typename Type>
-class DeltaByteArrayDecoderBase : public DecoderImpl,
-                              virtual public TypedDecoder<Type> {
+class DeltaByteArrayDecoderBase : public DecoderImpl, virtual public TypedDecoder<Type> {
  public:
   explicit DeltaByteArrayDecoderBase(const ColumnDescriptor* descr,
-                                 MemoryPool* pool = ::arrow::default_memory_pool())
+                                     MemoryPool* pool = ::arrow::default_memory_pool())
       : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY),
         prefix_len_decoder_(nullptr, pool),
         suffix_decoder_(nullptr, pool),
@@ -3201,10 +3201,9 @@
-  int DecodeArrow(
-      int num_values, int null_count, const uint8_t* valid_bits,
-      int64_t valid_bits_offset,
-      typename EncodingTraits<Type>::DictAccumulator* builder) override {
+  int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+                  int64_t valid_bits_offset,
+                  typename EncodingTraits<Type>::DictAccumulator* builder) override {
     ParquetException::NYI("DecodeArrow of DictAccumulator for DeltaByteArrayDecoder");
   }
From b299497b3d08245fcc3ff6df4e483026bdc43a67 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 26 Jun 2023 15:52:41 -0300
Subject: [PATCH 67/69] add binarypacked test for largebinaryvariant

---
 .../parquet/arrow/arrow_reader_writer_test.cc | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index e1aa14e9a9442..7ae5262e128a8 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -4647,6 +4647,24 @@ TEST_F(TestArrowReadDeltaEncoding, DeltaBinaryPacked) {
   ::arrow::AssertTablesEqual(*actual_table, *expect_table);
 }
 
+TEST_F(TestArrowReadDeltaEncoding, DeltaBinaryPackedWithLargeBinaryVariant) {
+  std::shared_ptr<::arrow::Table> actual_table, expect_table;
+  ArrowReaderProperties properties;
+  properties.set_use_large_binary_variants(true);
+
+  ReadTableFromParquetFile("delta_binary_packed.parquet", properties, &actual_table);
+
+  auto convert_options = ::arrow::csv::ConvertOptions::Defaults();
+  for (int i = 0; i <= 64; ++i) {
+    std::string column_name = "bitwidth" + std::to_string(i);
+    convert_options.column_types[column_name] = ::arrow::int64();
+  }
+  convert_options.column_types["int_value"] = ::arrow::int32();
+  ReadTableFromCSVFile("delta_binary_packed_expect.csv", convert_options, &expect_table);
+
+  ::arrow::AssertTablesEqual(*actual_table, *expect_table);
+}
+
 TEST_F(TestArrowReadDeltaEncoding, DeltaByteArray) {
   std::shared_ptr<::arrow::Table> actual_table, expect_table;
   ReadTableFromParquetFile("delta_byte_array.parquet", &actual_table);

From 2c23dd701f74e8586c42ea36a0bc4ae1fb34b578 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 27 Jun 2023 09:22:02 -0300
Subject: [PATCH 68/69] Revert "add binarypacked test for largebinaryvariant"

This reverts commit b299497b3d08245fcc3ff6df4e483026bdc43a67.

---
 .../parquet/arrow/arrow_reader_writer_test.cc | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 7ae5262e128a8..e1aa14e9a9442 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -4647,24 +4647,6 @@ TEST_F(TestArrowReadDeltaEncoding, DeltaBinaryPacked) {
   ::arrow::AssertTablesEqual(*actual_table, *expect_table);
 }
 
-TEST_F(TestArrowReadDeltaEncoding, DeltaBinaryPackedWithLargeBinaryVariant) {
-  std::shared_ptr<::arrow::Table> actual_table, expect_table;
-  ArrowReaderProperties properties;
-  properties.set_use_large_binary_variants(true);
-
-  ReadTableFromParquetFile("delta_binary_packed.parquet", properties, &actual_table);
-
-  auto convert_options = ::arrow::csv::ConvertOptions::Defaults();
-  for (int i = 0; i <= 64; ++i) {
-    std::string column_name = "bitwidth" + std::to_string(i);
-    convert_options.column_types[column_name] = ::arrow::int64();
-  }
-  convert_options.column_types["int_value"] = ::arrow::int32();
-  ReadTableFromCSVFile("delta_binary_packed_expect.csv", convert_options, &expect_table);
-
-  ::arrow::AssertTablesEqual(*actual_table, *expect_table);
-}
-
 TEST_F(TestArrowReadDeltaEncoding, DeltaByteArray) {
   std::shared_ptr<::arrow::Table> actual_table, expect_table;
   ReadTableFromParquetFile("delta_byte_array.parquet", &actual_table);

From eca9d6f9bff17c6eb64eab069cdebbcfa88f976a Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Thu, 6 Jul 2023 09:10:08 -0300
Subject: [PATCH 69/69] only run largebinary tests if system is 64bit

---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 6 ++++--
 cpp/src/parquet/encoding.cc                       | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index e1aa14e9a9442..2f3e8953daaf0 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -1353,7 +1353,7 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) {
 
 using TestStringParquetIO = TestParquetIO<::arrow::StringType>;
 
-#if defined(_WIN64) || defined(__x86_64__)
+#if defined(_WIN64) || defined(__LP64__)
 TEST_F(TestStringParquetIO, SmallStringWithLargeBinaryVariantSetting) {
   auto values = ArrayFromJSON(::arrow::utf8(), R"(["foo", "", null, "bar"])");
 
@@ -1397,6 +1397,7 @@ TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {
 
 using TestLargeBinaryParquetIO = TestParquetIO<::arrow::LargeBinaryType>;
 
+#if defined(_WIN64) || defined(__LP64__)
 TEST_F(TestLargeBinaryParquetIO, Basics) {
   const char* json = "[\"foo\", \"\", null, \"\xff\"]";
 
@@ -1447,6 +1448,7 @@ TEST_F(TestLargeStringParquetIO, Basics) {
       ::parquet::ArrowWriterProperties::Builder().store_schema()->build();
   this->RoundTripSingleColumn(large_array, large_array, arrow_properties);
 }
+#endif
 
 using TestNullParquetIO = TestParquetIO<::arrow::NullType>;
 
@@ -3903,7 +3905,7 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }
 
-#if defined(ARROW_WITH_BROTLI) && (defined(_WIN64) || defined(__x86_64__))
+#if defined(ARROW_WITH_BROTLI) && defined(__LP64__)
 TEST(TestArrowParquet, LargeByteArray) {
   auto path = test::get_data_file("large_string_map.brotli.parquet");
   TryReadDataFile(path, ::arrow::StatusCode::NotImplemented);
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 3d6bb01bf9752..bb931ecb5e929 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -1534,7 +1534,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
     offset_type total_size = 0;
     for (int i = 0; i < dictionary_length_; ++i) {
       if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) {
-        throw ParquetException("String/Binary length to large");
+        throw ParquetException("String/Binary length too large");
       }
     }
     PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,