From e5e96ec60184968fb3b7a571f258083895f2717f Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 26 May 2023 10:42:52 -0300 Subject: [PATCH 01/69] able to read the file --- cpp/CMakePresets.json | 7 +- cpp/diff.output | 359 ++++++++++++++++++ cpp/examples/arrow/parquet_read_write.cc | 2 +- .../parquet/parquet_arrow/reader_writer.cc | 30 +- cpp/src/arrow/array/builder_dict.h | 24 ++ cpp/src/arrow/type.h | 2 +- cpp/src/parquet/arrow/reader_internal.cc | 3 +- cpp/src/parquet/arrow/schema_internal.cc | 2 +- cpp/src/parquet/column_reader.cc | 6 +- cpp/src/parquet/column_reader.h | 5 + cpp/src/parquet/encoding.cc | 6 +- cpp/src/parquet/encoding.h | 8 +- 12 files changed, 426 insertions(+), 28 deletions(-) create mode 100644 cpp/diff.output diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 7882be57a0534..40ccd64a93695 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -220,7 +220,12 @@ "features-main" ], "displayName": "Debug build with tests and more optional components", - "cacheVariables": {} + "cacheVariables": { + "ARROW_BUILD_EXAMPLES": "ON", + "PARQUET_BUILD_EXAMPLES": "ON", + "ARROW_BUILD_TESTS": "ON", + "ARROW_BUILD_UTILITIES": "ON" + } }, { "name": "ninja-debug-cuda", diff --git a/cpp/diff.output b/cpp/diff.output new file mode 100644 index 0000000000000..3030a9aba673c --- /dev/null +++ b/cpp/diff.output @@ -0,0 +1,359 @@ +diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json +index 7882be57a..40ccd64a9 100644 +--- a/cpp/CMakePresets.json ++++ b/cpp/CMakePresets.json +@@ -220,7 +220,12 @@ + "features-main" + ], + "displayName": "Debug build with tests and more optional components", +- "cacheVariables": {} ++ "cacheVariables": { ++ "ARROW_BUILD_EXAMPLES": "ON", ++ "PARQUET_BUILD_EXAMPLES": "ON", ++ "ARROW_BUILD_TESTS": "ON", ++ "ARROW_BUILD_UTILITIES": "ON" ++ } + }, + { + "name": "ninja-debug-cuda", +diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc +index 3b8b4c221..20fe2c20b 100644 +--- a/cpp/examples/arrow/parquet_read_write.cc ++++ b/cpp/examples/arrow/parquet_read_write.cc +@@ -120,7 +120,7 @@ arrow::Status WriteFullFile(std::string path_to_file) { + + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), outfile, +- /*chunk_size=*/3, props, arrow_props)); ++ /*chunk_size=*/1024*1024*1024, props, arrow_props)); + return arrow::Status::OK(); + } + +diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc +index f5d96ec16..b4e28c662 100644 +--- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc ++++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc +@@ -56,20 +56,22 @@ void write_parquet_file(const arrow::Table& table) { + // the parquet file. Normally you would choose this to be rather large but + // for the example, we use a small value to have multiple RowGroups. 
+ PARQUET_THROW_NOT_OK( +- parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3)); ++ parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 1024*1024 * 1024)); + } + + // #2: Fully read in the file +-void read_whole_file() { +- std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl; ++void read_whole_file(const std::string & filename) { ++ std::cout << "Reading " << filename << " at once" << std::endl; + std::shared_ptr infile; + PARQUET_ASSIGN_OR_THROW(infile, +- arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", ++ arrow::io::ReadableFile::Open(filename, + arrow::default_memory_pool())); + + std::unique_ptr reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); ++ ++ + std::shared_ptr table; + PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); + std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() +@@ -94,18 +96,18 @@ void read_single_rowgroup() { + } + + // #4: Read only a single column of the whole parquet file +-void read_single_column() { +- std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl; ++void read_single_column(const std::string & filename) { ++ std::cout << "Reading first column of " << filename << std::endl; + std::shared_ptr infile; + PARQUET_ASSIGN_OR_THROW(infile, +- arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", ++ arrow::io::ReadableFile::Open(filename, + arrow::default_memory_pool())); + + std::unique_ptr reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr array; +- PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); ++ PARQUET_THROW_NOT_OK(reader->ReadColumn(5, &array)); + PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); + std::cout << std::endl; + } +@@ -131,10 +133,10 @@ void read_single_column_chunk() { + } + + int main(int argc, char** argv) { +- std::shared_ptr table = generate_table(); +- write_parquet_file(*table); +- read_whole_file(); +- read_single_rowgroup(); +- read_single_column(); +- read_single_column_chunk(); ++// std::shared_ptr table = generate_table(); ++// write_parquet_file(*table); ++ read_whole_file("minimal_repro.parquet"); ++// read_single_rowgroup(); ++// read_single_column("minimal_repro.parquet"); ++// read_single_column_chunk(); + } +diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc +index 745312f1d..3a5184d1d 100644 +--- a/cpp/src/arrow/array/array_nested.cc ++++ b/cpp/src/arrow/array/array_nested.cc +@@ -207,8 +207,8 @@ inline void SetListData(BaseListArray* self, const std::shared_ptrraw_value_offsets_ = + data->GetValuesSafe(1, /*offset=*/0); + +- ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); +- DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); ++// ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); ++// DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); + self->values_ = MakeArray(self->data_->child_data[0]); + } + +diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc +index 571f450aa..9754275e7 100644 +--- a/cpp/src/arrow/array/builder_binary.cc ++++ b/cpp/src/arrow/array/builder_binary.cc +@@ -137,6 +137,7 @@ namespace internal { + ChunkedBinaryBuilder::ChunkedBinaryBuilder(int32_t max_chunk_value_length, + MemoryPool* pool) + : 
max_chunk_value_length_(max_chunk_value_length), builder_(new BinaryBuilder(pool)) { ++ assert(false); + DCHECK_LE(max_chunk_value_length, kBinaryMemoryLimit); + } + +diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h +index cb0aaf309..9a248dc6f 100644 +--- a/cpp/src/arrow/array/builder_dict.h ++++ b/cpp/src/arrow/array/builder_dict.h +@@ -715,6 +715,29 @@ class Dictionary32Builder : public internal::DictionaryBuilderBase ++class Dictionary64Builder : public internal::DictionaryBuilderBase { ++ public: ++ using BASE = internal::DictionaryBuilderBase; ++ using BASE::BASE; ++ ++ /// \brief Append dictionary indices directly without modifying memo ++ /// ++ /// NOTE: Experimental API ++ Status AppendIndices(const int64_t* values, int64_t length, ++ const uint8_t* valid_bytes = NULLPTR) { ++ int64_t null_count_before = this->indices_builder_.null_count(); ++ ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); ++ this->capacity_ = this->indices_builder_.capacity(); ++ this->length_ += length; ++ this->null_count_ += this->indices_builder_.null_count() - null_count_before; ++ return Status::OK(); ++ } ++}; ++ + // ---------------------------------------------------------------------- + // Binary / Unicode builders + // (compatibility aliases; those used to be derived classes with additional +@@ -724,6 +747,7 @@ using BinaryDictionaryBuilder = DictionaryBuilder; + using StringDictionaryBuilder = DictionaryBuilder; + using BinaryDictionary32Builder = Dictionary32Builder; + using StringDictionary32Builder = Dictionary32Builder; ++using BinaryDictionary64Builder = Dictionary64Builder; + + /// @} + +diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc +index 0f2bd4583..7fc907986 100644 +--- a/cpp/src/arrow/array/validate.cc ++++ b/cpp/src/arrow/array/validate.cc +@@ -298,12 +298,12 @@ struct ValidateArrayImpl { + field_data.length, " < ", data.length + data.offset, ")"); + } + +- const auto& field_type = type.field(i)->type(); +- if (!field_data.type->Equals(*field_type)) { +- return Status::Invalid("Struct child array #", i, " does not match type field: ", +- field_data.type->ToString(), " vs ", +- field_type->ToString()); +- } ++// const auto& field_type = type.field(i)->type(); ++// if (!field_data.type->Equals(*field_type)) { ++// return Status::Invalid("Struct child array #", i, " does not match type field: ", ++// field_data.type->ToString(), " vs ", ++// field_type->ToString()); ++// } + } + return Status::OK(); + } +diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h +index 48228d43e..73c1a9d44 100644 +--- a/cpp/src/arrow/type.h ++++ b/cpp/src/arrow/type.h +@@ -676,7 +676,7 @@ class ARROW_EXPORT BaseBinaryType : public DataType { + ~BaseBinaryType() override; + }; + +-constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; ++constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; + + /// \addtogroup binary-datatypes + /// +diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc +index 40fbdcbb5..f7b31b67f 100644 +--- a/cpp/src/parquet/arrow/reader.cc ++++ b/cpp/src/parquet/arrow/reader.cc +@@ -90,6 +90,8 @@ namespace { + case 1: + return chunked.chunk(0)->data(); + default: ++// auto flattened = chunked.Flatten().ValueOrDie(); ++// return flattened[0]->chunk(0)->data(); + // ARROW-3762(wesm): If item reader yields a chunked array, we reject as + // this is not yet implemented + return Status::NotImplemented( +diff --git 
a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc +index a294b712a..3785eac26 100644 +--- a/cpp/src/parquet/arrow/reader_internal.cc ++++ b/cpp/src/parquet/arrow/reader_internal.cc +@@ -85,6 +85,7 @@ using ::arrow::internal::SafeLeftShift; + using ::arrow::util::SafeLoadAs; + + using parquet::internal::BinaryRecordReader; ++using parquet::internal::LargeBinaryRecordReader; + using parquet::internal::DictionaryRecordReader; + using parquet::internal::RecordReader; + using parquet::schema::GroupNode; +@@ -482,7 +483,7 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, + ::arrow::compute::CastOptions cast_options; + cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data + +- auto binary_reader = dynamic_cast(reader); ++ auto binary_reader = dynamic_cast(reader); + DCHECK(binary_reader); + auto chunks = binary_reader->GetBuilderChunks(); + for (auto& chunk : chunks) { +diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc +index 064bf4f55..dbff14d93 100644 +--- a/cpp/src/parquet/arrow/schema_internal.cc ++++ b/cpp/src/parquet/arrow/schema_internal.cc +@@ -113,7 +113,7 @@ Result> MakeArrowTimestamp(const LogicalType& logical + Result> FromByteArray(const LogicalType& logical_type) { + switch (logical_type.type()) { + case LogicalType::Type::STRING: +- return ::arrow::utf8(); ++ return ::arrow::large_utf8(); + case LogicalType::Type::DECIMAL: + return MakeArrowDecimal(logical_type); + case LogicalType::Type::NONE: +diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc +index 3294aaaf2..3cfdb6cb8 100644 +--- a/cpp/src/parquet/column_reader.cc ++++ b/cpp/src/parquet/column_reader.cc +@@ -2094,14 +2094,14 @@ class FLBARecordReader : public TypedRecordReader, + }; + + class ByteArrayChunkedRecordReader : public TypedRecordReader, +- virtual public BinaryRecordReader { ++ virtual public LargeBinaryRecordReader { + public: + ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, + ::arrow::MemoryPool* pool, bool read_dense_for_nullable) + : TypedRecordReader(descr, leaf_info, pool, + read_dense_for_nullable) { + ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); +- accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool); ++ accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); + } + + ::arrow::ArrayVector GetBuilderChunks() override { +@@ -2213,7 +2213,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, + private: + using BinaryDictDecoder = DictDecoder; + +- ::arrow::BinaryDictionary32Builder builder_; ++ ::arrow::BinaryDictionary64Builder builder_; + std::vector> result_chunks_; + }; + +diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h +index 334b8bcff..b652a89d8 100644 +--- a/cpp/src/parquet/column_reader.h ++++ b/cpp/src/parquet/column_reader.h +@@ -470,6 +470,11 @@ class BinaryRecordReader : virtual public RecordReader { + virtual std::vector> GetBuilderChunks() = 0; + }; + ++class LargeBinaryRecordReader : virtual public RecordReader { ++ public: ++ virtual std::vector> GetBuilderChunks() = 0; ++}; ++ + /// \brief Read records directly to dictionary-encoded Arrow form (int32 + /// indices). 
Only valid for BYTE_ARRAY columns + class DictionaryRecordReader : virtual public RecordReader { +diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc +index 134a22f28..b52cd3b30 100644 +--- a/cpp/src/parquet/encoding.cc ++++ b/cpp/src/parquet/encoding.cc +@@ -1271,7 +1271,7 @@ struct ArrowBinaryHelper { + Status AppendNull() { return builder->AppendNull(); } + + typename EncodingTraits::Accumulator* out; +- ::arrow::BinaryBuilder* builder; ++ ::arrow::LargeBinaryBuilder* builder; + int64_t chunk_space_remaining; + }; + +@@ -1349,7 +1349,7 @@ class PlainByteArrayDecoder : public PlainDecoder, + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, +- ::arrow::BinaryDictionary32Builder* builder) override { ++ ::arrow::BinaryDictionary64Builder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); +@@ -1862,7 +1862,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, +- ::arrow::BinaryDictionary32Builder* builder) override { ++ ::arrow::BinaryDictionary64Builder* builder) override { + int result = 0; + if (null_count == 0) { + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); +diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h +index 9f9b740ff..ab80284e6 100644 +--- a/cpp/src/parquet/encoding.h ++++ b/cpp/src/parquet/encoding.h +@@ -45,6 +45,8 @@ class NumericBuilder; + class FixedSizeBinaryBuilder; + template + class Dictionary32Builder; ++template ++class Dictionary64Builder; + + } // namespace arrow + +@@ -144,11 +146,11 @@ struct EncodingTraits { + /// \brief Internal helper class for decoding BYTE_ARRAY data where we can + /// overflow the capacity of a single arrow::BinaryArray + struct Accumulator { +- std::unique_ptr<::arrow::BinaryBuilder> builder; ++ std::unique_ptr<::arrow::LargeBinaryBuilder> builder; + std::vector> chunks; + }; +- using ArrowType = ::arrow::BinaryType; +- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; ++ using ArrowType = ::arrow::LargeBinaryType; ++ using DictAccumulator = ::arrow::Dictionary64Builder<::arrow::LargeBinaryType>; + }; + + template <> diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index 3b8b4c2212b75..20fe2c20b291a 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -120,7 +120,7 @@ arrow::Status WriteFullFile(std::string path_to_file) { ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), outfile, - /*chunk_size=*/3, props, arrow_props)); + /*chunk_size=*/1024*1024*1024, props, arrow_props)); return arrow::Status::OK(); } diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index f5d96ec16ca64..b4e28c662a88e 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -56,20 +56,22 @@ void write_parquet_file(const arrow::Table& table) { // the parquet file. Normally you would choose this to be rather large but // for the example, we use a small value to have multiple RowGroups. 
PARQUET_THROW_NOT_OK( - parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3)); + parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 1024*1024 * 1024)); } // #2: Fully read in the file -void read_whole_file() { - std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl; +void read_whole_file(const std::string & filename) { + std::cout << "Reading " << filename << " at once" << std::endl; std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, - arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", + arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool())); std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + + std::shared_ptr table; PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() @@ -94,18 +96,18 @@ void read_single_rowgroup() { } // #4: Read only a single column of the whole parquet file -void read_single_column() { - std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl; +void read_single_column(const std::string & filename) { + std::cout << "Reading first column of " << filename << std::endl; std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, - arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", + arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool())); std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); std::shared_ptr array; - PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); + PARQUET_THROW_NOT_OK(reader->ReadColumn(5, &array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; } @@ -131,10 +133,10 @@ void read_single_column_chunk() { } int main(int argc, char** argv) { - std::shared_ptr table = generate_table(); - write_parquet_file(*table); - read_whole_file(); - read_single_rowgroup(); - read_single_column(); - read_single_column_chunk(); +// std::shared_ptr table = generate_table(); +// write_parquet_file(*table); + read_whole_file("minimal_repro.parquet"); +// read_single_rowgroup(); +// read_single_column("minimal_repro.parquet"); +// read_single_column_chunk(); } diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index cb0aaf309915b..9a248dc6fe393 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -715,6 +715,29 @@ class Dictionary32Builder : public internal::DictionaryBuilderBase +class Dictionary64Builder : public internal::DictionaryBuilderBase { + public: + using BASE = internal::DictionaryBuilderBase; + using BASE::BASE; + + /// \brief Append dictionary indices directly without modifying memo + /// + /// NOTE: Experimental API + Status AppendIndices(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + int64_t null_count_before = this->indices_builder_.null_count(); + ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); + this->capacity_ = this->indices_builder_.capacity(); + this->length_ += length; + this->null_count_ += this->indices_builder_.null_count() - null_count_before; + return Status::OK(); + } +}; + // ---------------------------------------------------------------------- // Binary / Unicode builders // (compatibility aliases; those used to be derived classes with additional @@ -724,6 +747,7 @@ using BinaryDictionaryBuilder = 
DictionaryBuilder; using StringDictionaryBuilder = DictionaryBuilder; using BinaryDictionary32Builder = Dictionary32Builder; using StringDictionary32Builder = Dictionary32Builder; +using BinaryDictionary64Builder = Dictionary64Builder; /// @} diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 48228d43ef932..73c1a9d445398 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -676,7 +676,7 @@ class ARROW_EXPORT BaseBinaryType : public DataType { ~BaseBinaryType() override; }; -constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; /// \addtogroup binary-datatypes /// diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index a294b712a7ce3..3785eac26b284 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -85,6 +85,7 @@ using ::arrow::internal::SafeLeftShift; using ::arrow::util::SafeLoadAs; using parquet::internal::BinaryRecordReader; +using parquet::internal::LargeBinaryRecordReader; using parquet::internal::DictionaryRecordReader; using parquet::internal::RecordReader; using parquet::schema::GroupNode; @@ -482,7 +483,7 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, ::arrow::compute::CastOptions cast_options; cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data - auto binary_reader = dynamic_cast(reader); + auto binary_reader = dynamic_cast(reader); DCHECK(binary_reader); auto chunks = binary_reader->GetBuilderChunks(); for (auto& chunk : chunks) { diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index 064bf4f55cc7e..dbff14d93b84e 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -113,7 +113,7 @@ Result> MakeArrowTimestamp(const LogicalType& logical Result> FromByteArray(const LogicalType& logical_type) { switch (logical_type.type()) { case LogicalType::Type::STRING: - return ::arrow::utf8(); + return ::arrow::large_utf8(); case LogicalType::Type::DECIMAL: return MakeArrowDecimal(logical_type); case LogicalType::Type::NONE: diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3294aaaf283f1..3cfdb6cb83ca1 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2094,14 +2094,14 @@ class FLBARecordReader : public TypedRecordReader, }; class ByteArrayChunkedRecordReader : public TypedRecordReader, - virtual public BinaryRecordReader { + virtual public LargeBinaryRecordReader { public: ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); - accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool); + accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); } ::arrow::ArrayVector GetBuilderChunks() override { @@ -2213,7 +2213,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, private: using BinaryDictDecoder = DictDecoder; - ::arrow::BinaryDictionary32Builder builder_; + ::arrow::BinaryDictionary64Builder builder_; std::vector> result_chunks_; }; diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 334b8bcffe0b8..b652a89d8cf3b 100644 --- a/cpp/src/parquet/column_reader.h +++ 
b/cpp/src/parquet/column_reader.h @@ -470,6 +470,11 @@ class BinaryRecordReader : virtual public RecordReader { virtual std::vector> GetBuilderChunks() = 0; }; +class LargeBinaryRecordReader : virtual public RecordReader { + public: + virtual std::vector> GetBuilderChunks() = 0; +}; + /// \brief Read records directly to dictionary-encoded Arrow form (int32 /// indices). Only valid for BYTE_ARRAY columns class DictionaryRecordReader : virtual public RecordReader { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 134a22f28412b..b52cd3b303c29 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1271,7 +1271,7 @@ struct ArrowBinaryHelper { Status AppendNull() { return builder->AppendNull(); } typename EncodingTraits::Accumulator* out; - ::arrow::BinaryBuilder* builder; + ::arrow::LargeBinaryBuilder* builder; int64_t chunk_space_remaining; }; @@ -1349,7 +1349,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { + ::arrow::BinaryDictionary64Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -1862,7 +1862,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { + ::arrow::BinaryDictionary64Builder* builder) override { int result = 0; if (null_count == 0) { PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 9f9b740ff3424..ab80284e6f83b 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -45,6 +45,8 @@ class NumericBuilder; class FixedSizeBinaryBuilder; template class Dictionary32Builder; +template +class Dictionary64Builder; } // namespace arrow @@ -144,11 +146,11 @@ struct EncodingTraits { /// \brief Internal helper class for decoding BYTE_ARRAY data where we can /// overflow the capacity of a single arrow::BinaryArray struct Accumulator { - std::unique_ptr<::arrow::BinaryBuilder> builder; + std::unique_ptr<::arrow::LargeBinaryBuilder> builder; std::vector> chunks; }; - using ArrowType = ::arrow::BinaryType; - using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; + using ArrowType = ::arrow::LargeBinaryType; + using DictAccumulator = ::arrow::Dictionary64Builder<::arrow::LargeBinaryType>; }; template <> From b9b48f8eb5a61d15e5cec39d175deaf519f31654 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 26 May 2023 10:46:12 -0300 Subject: [PATCH 02/69] remove diff out --- cpp/diff.output | 359 ------------------------------------------------ 1 file changed, 359 deletions(-) delete mode 100644 cpp/diff.output diff --git a/cpp/diff.output b/cpp/diff.output deleted file mode 100644 index 3030a9aba673c..0000000000000 --- a/cpp/diff.output +++ /dev/null @@ -1,359 +0,0 @@ -diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json -index 7882be57a..40ccd64a9 100644 ---- a/cpp/CMakePresets.json -+++ b/cpp/CMakePresets.json -@@ -220,7 +220,12 @@ - "features-main" - ], - "displayName": "Debug build with tests and more optional components", -- "cacheVariables": {} -+ "cacheVariables": { -+ "ARROW_BUILD_EXAMPLES": "ON", -+ "PARQUET_BUILD_EXAMPLES": "ON", -+ "ARROW_BUILD_TESTS": "ON", -+ 
"ARROW_BUILD_UTILITIES": "ON" -+ } - }, - { - "name": "ninja-debug-cuda", -diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc -index 3b8b4c221..20fe2c20b 100644 ---- a/cpp/examples/arrow/parquet_read_write.cc -+++ b/cpp/examples/arrow/parquet_read_write.cc -@@ -120,7 +120,7 @@ arrow::Status WriteFullFile(std::string path_to_file) { - - ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), - arrow::default_memory_pool(), outfile, -- /*chunk_size=*/3, props, arrow_props)); -+ /*chunk_size=*/1024*1024*1024, props, arrow_props)); - return arrow::Status::OK(); - } - -diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc -index f5d96ec16..b4e28c662 100644 ---- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc -+++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc -@@ -56,20 +56,22 @@ void write_parquet_file(const arrow::Table& table) { - // the parquet file. Normally you would choose this to be rather large but - // for the example, we use a small value to have multiple RowGroups. - PARQUET_THROW_NOT_OK( -- parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3)); -+ parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 1024*1024 * 1024)); - } - - // #2: Fully read in the file --void read_whole_file() { -- std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl; -+void read_whole_file(const std::string & filename) { -+ std::cout << "Reading " << filename << " at once" << std::endl; - std::shared_ptr infile; - PARQUET_ASSIGN_OR_THROW(infile, -- arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", -+ arrow::io::ReadableFile::Open(filename, - arrow::default_memory_pool())); - - std::unique_ptr reader; - PARQUET_THROW_NOT_OK( - parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); -+ -+ - std::shared_ptr table; - PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); - std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() -@@ -94,18 +96,18 @@ void read_single_rowgroup() { - } - - // #4: Read only a single column of the whole parquet file --void read_single_column() { -- std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl; -+void read_single_column(const std::string & filename) { -+ std::cout << "Reading first column of " << filename << std::endl; - std::shared_ptr infile; - PARQUET_ASSIGN_OR_THROW(infile, -- arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", -+ arrow::io::ReadableFile::Open(filename, - arrow::default_memory_pool())); - - std::unique_ptr reader; - PARQUET_THROW_NOT_OK( - parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - std::shared_ptr array; -- PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); -+ PARQUET_THROW_NOT_OK(reader->ReadColumn(5, &array)); - PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); - std::cout << std::endl; - } -@@ -131,10 +133,10 @@ void read_single_column_chunk() { - } - - int main(int argc, char** argv) { -- std::shared_ptr table = generate_table(); -- write_parquet_file(*table); -- read_whole_file(); -- read_single_rowgroup(); -- read_single_column(); -- read_single_column_chunk(); -+// std::shared_ptr table = generate_table(); -+// write_parquet_file(*table); -+ read_whole_file("minimal_repro.parquet"); -+// read_single_rowgroup(); -+// read_single_column("minimal_repro.parquet"); -+// read_single_column_chunk(); - } -diff --git 
a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc -index 745312f1d..3a5184d1d 100644 ---- a/cpp/src/arrow/array/array_nested.cc -+++ b/cpp/src/arrow/array/array_nested.cc -@@ -207,8 +207,8 @@ inline void SetListData(BaseListArray* self, const std::shared_ptrraw_value_offsets_ = - data->GetValuesSafe(1, /*offset=*/0); - -- ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); -- DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); -+// ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); -+// DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); - self->values_ = MakeArray(self->data_->child_data[0]); - } - -diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc -index 571f450aa..9754275e7 100644 ---- a/cpp/src/arrow/array/builder_binary.cc -+++ b/cpp/src/arrow/array/builder_binary.cc -@@ -137,6 +137,7 @@ namespace internal { - ChunkedBinaryBuilder::ChunkedBinaryBuilder(int32_t max_chunk_value_length, - MemoryPool* pool) - : max_chunk_value_length_(max_chunk_value_length), builder_(new BinaryBuilder(pool)) { -+ assert(false); - DCHECK_LE(max_chunk_value_length, kBinaryMemoryLimit); - } - -diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h -index cb0aaf309..9a248dc6f 100644 ---- a/cpp/src/arrow/array/builder_dict.h -+++ b/cpp/src/arrow/array/builder_dict.h -@@ -715,6 +715,29 @@ class Dictionary32Builder : public internal::DictionaryBuilderBase -+class Dictionary64Builder : public internal::DictionaryBuilderBase { -+ public: -+ using BASE = internal::DictionaryBuilderBase; -+ using BASE::BASE; -+ -+ /// \brief Append dictionary indices directly without modifying memo -+ /// -+ /// NOTE: Experimental API -+ Status AppendIndices(const int64_t* values, int64_t length, -+ const uint8_t* valid_bytes = NULLPTR) { -+ int64_t null_count_before = this->indices_builder_.null_count(); -+ ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); -+ this->capacity_ = this->indices_builder_.capacity(); -+ this->length_ += length; -+ this->null_count_ += this->indices_builder_.null_count() - null_count_before; -+ return Status::OK(); -+ } -+}; -+ - // ---------------------------------------------------------------------- - // Binary / Unicode builders - // (compatibility aliases; those used to be derived classes with additional -@@ -724,6 +747,7 @@ using BinaryDictionaryBuilder = DictionaryBuilder; - using StringDictionaryBuilder = DictionaryBuilder; - using BinaryDictionary32Builder = Dictionary32Builder; - using StringDictionary32Builder = Dictionary32Builder; -+using BinaryDictionary64Builder = Dictionary64Builder; - - /// @} - -diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc -index 0f2bd4583..7fc907986 100644 ---- a/cpp/src/arrow/array/validate.cc -+++ b/cpp/src/arrow/array/validate.cc -@@ -298,12 +298,12 @@ struct ValidateArrayImpl { - field_data.length, " < ", data.length + data.offset, ")"); - } - -- const auto& field_type = type.field(i)->type(); -- if (!field_data.type->Equals(*field_type)) { -- return Status::Invalid("Struct child array #", i, " does not match type field: ", -- field_data.type->ToString(), " vs ", -- field_type->ToString()); -- } -+// const auto& field_type = type.field(i)->type(); -+// if (!field_data.type->Equals(*field_type)) { -+// return Status::Invalid("Struct child array #", i, " does not match type field: ", 
-+// field_data.type->ToString(), " vs ", -+// field_type->ToString()); -+// } - } - return Status::OK(); - } -diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h -index 48228d43e..73c1a9d44 100644 ---- a/cpp/src/arrow/type.h -+++ b/cpp/src/arrow/type.h -@@ -676,7 +676,7 @@ class ARROW_EXPORT BaseBinaryType : public DataType { - ~BaseBinaryType() override; - }; - --constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; -+constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; - - /// \addtogroup binary-datatypes - /// -diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc -index 40fbdcbb5..f7b31b67f 100644 ---- a/cpp/src/parquet/arrow/reader.cc -+++ b/cpp/src/parquet/arrow/reader.cc -@@ -90,6 +90,8 @@ namespace { - case 1: - return chunked.chunk(0)->data(); - default: -+// auto flattened = chunked.Flatten().ValueOrDie(); -+// return flattened[0]->chunk(0)->data(); - // ARROW-3762(wesm): If item reader yields a chunked array, we reject as - // this is not yet implemented - return Status::NotImplemented( -diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc -index a294b712a..3785eac26 100644 ---- a/cpp/src/parquet/arrow/reader_internal.cc -+++ b/cpp/src/parquet/arrow/reader_internal.cc -@@ -85,6 +85,7 @@ using ::arrow::internal::SafeLeftShift; - using ::arrow::util::SafeLoadAs; - - using parquet::internal::BinaryRecordReader; -+using parquet::internal::LargeBinaryRecordReader; - using parquet::internal::DictionaryRecordReader; - using parquet::internal::RecordReader; - using parquet::schema::GroupNode; -@@ -482,7 +483,7 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, - ::arrow::compute::CastOptions cast_options; - cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data - -- auto binary_reader = dynamic_cast(reader); -+ auto binary_reader = dynamic_cast(reader); - DCHECK(binary_reader); - auto chunks = binary_reader->GetBuilderChunks(); - for (auto& chunk : chunks) { -diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc -index 064bf4f55..dbff14d93 100644 ---- a/cpp/src/parquet/arrow/schema_internal.cc -+++ b/cpp/src/parquet/arrow/schema_internal.cc -@@ -113,7 +113,7 @@ Result> MakeArrowTimestamp(const LogicalType& logical - Result> FromByteArray(const LogicalType& logical_type) { - switch (logical_type.type()) { - case LogicalType::Type::STRING: -- return ::arrow::utf8(); -+ return ::arrow::large_utf8(); - case LogicalType::Type::DECIMAL: - return MakeArrowDecimal(logical_type); - case LogicalType::Type::NONE: -diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc -index 3294aaaf2..3cfdb6cb8 100644 ---- a/cpp/src/parquet/column_reader.cc -+++ b/cpp/src/parquet/column_reader.cc -@@ -2094,14 +2094,14 @@ class FLBARecordReader : public TypedRecordReader, - }; - - class ByteArrayChunkedRecordReader : public TypedRecordReader, -- virtual public BinaryRecordReader { -+ virtual public LargeBinaryRecordReader { - public: - ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool, bool read_dense_for_nullable) - : TypedRecordReader(descr, leaf_info, pool, - read_dense_for_nullable) { - ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); -- accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool); -+ accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); - } - - ::arrow::ArrayVector 
GetBuilderChunks() override { -@@ -2213,7 +2213,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, - private: - using BinaryDictDecoder = DictDecoder; - -- ::arrow::BinaryDictionary32Builder builder_; -+ ::arrow::BinaryDictionary64Builder builder_; - std::vector> result_chunks_; - }; - -diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h -index 334b8bcff..b652a89d8 100644 ---- a/cpp/src/parquet/column_reader.h -+++ b/cpp/src/parquet/column_reader.h -@@ -470,6 +470,11 @@ class BinaryRecordReader : virtual public RecordReader { - virtual std::vector> GetBuilderChunks() = 0; - }; - -+class LargeBinaryRecordReader : virtual public RecordReader { -+ public: -+ virtual std::vector> GetBuilderChunks() = 0; -+}; -+ - /// \brief Read records directly to dictionary-encoded Arrow form (int32 - /// indices). Only valid for BYTE_ARRAY columns - class DictionaryRecordReader : virtual public RecordReader { -diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc -index 134a22f28..b52cd3b30 100644 ---- a/cpp/src/parquet/encoding.cc -+++ b/cpp/src/parquet/encoding.cc -@@ -1271,7 +1271,7 @@ struct ArrowBinaryHelper { - Status AppendNull() { return builder->AppendNull(); } - - typename EncodingTraits::Accumulator* out; -- ::arrow::BinaryBuilder* builder; -+ ::arrow::LargeBinaryBuilder* builder; - int64_t chunk_space_remaining; - }; - -@@ -1349,7 +1349,7 @@ class PlainByteArrayDecoder : public PlainDecoder, - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, -- ::arrow::BinaryDictionary32Builder* builder) override { -+ ::arrow::BinaryDictionary64Builder* builder) override { - int result = 0; - PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, - valid_bits_offset, builder, &result)); -@@ -1862,7 +1862,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, -- ::arrow::BinaryDictionary32Builder* builder) override { -+ ::arrow::BinaryDictionary64Builder* builder) override { - int result = 0; - if (null_count == 0) { - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); -diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h -index 9f9b740ff..ab80284e6 100644 ---- a/cpp/src/parquet/encoding.h -+++ b/cpp/src/parquet/encoding.h -@@ -45,6 +45,8 @@ class NumericBuilder; - class FixedSizeBinaryBuilder; - template - class Dictionary32Builder; -+template -+class Dictionary64Builder; - - } // namespace arrow - -@@ -144,11 +146,11 @@ struct EncodingTraits { - /// \brief Internal helper class for decoding BYTE_ARRAY data where we can - /// overflow the capacity of a single arrow::BinaryArray - struct Accumulator { -- std::unique_ptr<::arrow::BinaryBuilder> builder; -+ std::unique_ptr<::arrow::LargeBinaryBuilder> builder; - std::vector> chunks; - }; -- using ArrowType = ::arrow::BinaryType; -- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; -+ using ArrowType = ::arrow::LargeBinaryType; -+ using DictAccumulator = ::arrow::Dictionary64Builder<::arrow::LargeBinaryType>; - }; - - template <> From ae62954a0ebd915ff9f81b74406db297107d8a0c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 29 May 2023 10:34:31 -0300 Subject: [PATCH 03/69] intermediate stage, not working properly anymore.. 
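
This series moves the Parquet BYTE_ARRAY read path off the 32-bit offset
builders (arrow::BinaryBuilder, Dictionary32Builder) and onto 64-bit
variants (arrow::LargeBinaryBuilder plus a new Dictionary64Builder), so a
single column chunk is no longer capped by the 2 GiB kBinaryMemoryLimit.
This commit threads a use_binary_string_large_variants flag through
RecordReader::Make and adds Large* record readers and decoders alongside
the existing ones.

A minimal sketch of how a caller might opt in, assuming the
set_use_binary_large_variants() setter that is still commented out in the
example code below eventually lands under that name (both the property
name and the ReadWithLargeVariants() helper here are illustrative and may
change):

    #include <arrow/io/file.h>
    #include <arrow/result.h>
    #include <arrow/table.h>
    #include <parquet/arrow/reader.h>
    #include <parquet/properties.h>

    arrow::Status ReadWithLargeVariants(const std::string& path) {
      ARROW_ASSIGN_OR_RAISE(auto infile, arrow::io::ReadableFile::Open(path));

      parquet::ReaderProperties props = parquet::default_reader_properties();
      // Hypothetical setter, still commented out in this series:
      // props.set_use_binary_large_variants(true);

      parquet::arrow::FileReaderBuilder builder;
      ARROW_RETURN_NOT_OK(builder.Open(infile, props));

      std::unique_ptr<parquet::arrow::FileReader> reader;
      ARROW_RETURN_NOT_OK(builder.Build(&reader));

      // With the large variants enabled, STRING columns come back as
      // large_utf8 (64-bit offsets) per the FromByteArray change above.
      std::shared_ptr<arrow::Table> table;
      ARROW_RETURN_NOT_OK(reader->ReadTable(&table));
      return arrow::Status::OK();
    }
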
--- .../parquet/parquet_arrow/reader_writer.cc | 14 +- cpp/src/arrow/memory_pool.cc | 3 + cpp/src/arrow/type.h | 5 +- cpp/src/parquet/arrow/reader.cc | 2 +- cpp/src/parquet/arrow/reader_internal.cc | 2 +- cpp/src/parquet/column_reader.cc | 195 ++++++-- cpp/src/parquet/column_reader.h | 5 +- cpp/src/parquet/encoding.cc | 445 +++++++++++++++++- cpp/src/parquet/encoding.h | 17 + cpp/src/parquet/properties.h | 16 + cpp/src/parquet/types.h | 32 ++ 11 files changed, 695 insertions(+), 41 deletions(-) diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index b4e28c662a88e..8357f380d106e 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -68,9 +68,19 @@ void read_whole_file(const std::string & filename) { arrow::default_memory_pool())); std::unique_ptr reader; - PARQUET_THROW_NOT_OK( - parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + parquet::arrow::FileReaderBuilder builder; + + parquet::ReaderProperties props = parquet::default_reader_properties(); + +// props.set_use_binary_large_variants(true); + + PARQUET_THROW_NOT_OK(builder.Open(infile, props)); + + PARQUET_THROW_NOT_OK(builder.Build(&reader)); + +// PARQUET_THROW_NOT_OK( +// parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); std::shared_ptr table; PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index 843329c17bc28..dd14953f7ff47 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -888,6 +888,9 @@ class PoolBuffer final : public ResizableBuffer { capacity_ = new_capacity; } } else { + if (new_size > static_cast(pow(2, 59))) { + assert(false); + } RETURN_NOT_OK(Reserve(new_size)); } size_ = new_size; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 73c1a9d445398..820312e7a0c77 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -676,7 +677,9 @@ class ARROW_EXPORT BaseBinaryType : public DataType { ~BaseBinaryType() override; }; -constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; + +constexpr int64_t kLargeBinaryMemoryLimit = std::numeric_limits::max() - 1; // 2^35 /// \addtogroup binary-datatypes /// diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 40fbdcbb562b1..09b3c8c5f8fdd 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -462,7 +462,7 @@ class LeafReader : public ColumnReaderImpl { input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make( - descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY); + descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, true /* large variants*/); NextRowGroup(); } diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 3785eac26b284..b9c913bc24291 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -483,7 +483,7 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, ::arrow::compute::CastOptions cast_options; cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data - auto binary_reader = dynamic_cast(reader); + auto binary_reader = 
dynamic_cast(reader); DCHECK(binary_reader); auto chunks = binary_reader->GetBuilderChunks(); for (auto& chunk : chunks) { diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3cfdb6cb83ca1..360c506dcff4a 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1988,33 +1988,33 @@ class TypedRecordReader : public TypedColumnReaderImpl, } void DebugPrintState() override { - const int16_t* def_levels = this->def_levels(); - const int16_t* rep_levels = this->rep_levels(); - const int64_t total_levels_read = levels_position_; - - const T* vals = reinterpret_cast(this->values()); - - if (leaf_info_.def_level > 0) { - std::cout << "def levels: "; - for (int64_t i = 0; i < total_levels_read; ++i) { - std::cout << def_levels[i] << " "; - } - std::cout << std::endl; - } - - if (leaf_info_.rep_level > 0) { - std::cout << "rep levels: "; - for (int64_t i = 0; i < total_levels_read; ++i) { - std::cout << rep_levels[i] << " "; - } - std::cout << std::endl; - } - - std::cout << "values: "; - for (int64_t i = 0; i < this->values_written(); ++i) { - std::cout << vals[i] << " "; - } - std::cout << std::endl; +// const int16_t* def_levels = this->def_levels(); +// const int16_t* rep_levels = this->rep_levels(); +// const int64_t total_levels_read = levels_position_; +// +// const T* vals = reinterpret_cast(this->values()); +// +// if (leaf_info_.def_level > 0) { +// std::cout << "def levels: "; +// for (int64_t i = 0; i < total_levels_read; ++i) { +// std::cout << def_levels[i] << " "; +// } +// std::cout << std::endl; +// } +// +// if (leaf_info_.rep_level > 0) { +// std::cout << "rep levels: "; +// for (int64_t i = 0; i < total_levels_read; ++i) { +// std::cout << rep_levels[i] << " "; +// } +// std::cout << std::endl; +// } +// +// std::cout << "values: "; +// for (int64_t i = 0; i < this->values_written(); ++i) { +//// std::cout << vals[i] << " "; +// } +// std::cout << std::endl; } void ResetValues() { @@ -2094,14 +2094,14 @@ class FLBARecordReader : public TypedRecordReader, }; class ByteArrayChunkedRecordReader : public TypedRecordReader, - virtual public LargeBinaryRecordReader { + virtual public BinaryRecordReader { public: ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); - accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); + accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool); } ::arrow::ArrayVector GetBuilderChunks() override { @@ -2135,6 +2135,48 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader, typename EncodingTraits::Accumulator accumulator_; }; +class LargeByteArrayChunkedRecordReader : public TypedRecordReader, + virtual public LargeBinaryRecordReader { + public: + LargeByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, + ::arrow::MemoryPool* pool, bool read_dense_for_nullable) + : TypedRecordReader(descr, leaf_info, pool, + read_dense_for_nullable) { + ARROW_DCHECK_EQ(descr_->physical_type(), Type::LARGE_BYTE_ARRAY); + accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); + } + + ::arrow::ArrayVector GetBuilderChunks() override { + ::arrow::ArrayVector result = accumulator_.chunks; + if (result.size() == 0 || accumulator_.builder->length() > 0) { + std::shared_ptr<::arrow::Array> last_chunk; + 
PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk)); + result.push_back(std::move(last_chunk)); + } + accumulator_.chunks = {}; + return result; + } + + void ReadValuesDense(int64_t values_to_read) override { + int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull( + static_cast(values_to_read), &accumulator_); + CheckNumberDecoded(num_decoded, values_to_read); + ResetValues(); + } + + void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { + int64_t num_decoded = this->current_decoder_->DecodeArrow( + static_cast(values_to_read), static_cast(null_count), + valid_bits_->mutable_data(), values_written_, &accumulator_); + CheckNumberDecoded(num_decoded, values_to_read - null_count); + ResetValues(); + } + + private: + // Helper data structure for accumulating builder chunks + typename EncodingTraits::Accumulator accumulator_; +}; + class ByteArrayDictionaryRecordReader : public TypedRecordReader, virtual public DictionaryRecordReader { public: @@ -2213,6 +2255,88 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, private: using BinaryDictDecoder = DictDecoder; + ::arrow::BinaryDictionary32Builder builder_; + std::vector> result_chunks_; +}; + +class LargeByteArrayDictionaryRecordReader : public TypedRecordReader, + virtual public DictionaryRecordReader { + public: + LargeByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, + ::arrow::MemoryPool* pool, bool read_dense_for_nullable) + : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), + builder_(pool) { + this->read_dictionary_ = true; + } + + std::shared_ptr<::arrow::ChunkedArray> GetResult() override { + FlushBuilder(); + std::vector> result; + std::swap(result, result_chunks_); + return std::make_shared<::arrow::ChunkedArray>(std::move(result), builder_.type()); + } + + void FlushBuilder() { + if (builder_.length() > 0) { + std::shared_ptr<::arrow::Array> chunk; + PARQUET_THROW_NOT_OK(builder_.Finish(&chunk)); + result_chunks_.emplace_back(std::move(chunk)); + + // Also clears the dictionary memo table + builder_.Reset(); + } + } + + void MaybeWriteNewDictionary() { + if (this->new_dictionary_) { + /// If there is a new dictionary, we may need to flush the builder, then + /// insert the new dictionary values + FlushBuilder(); + builder_.ResetFull(); + auto decoder = dynamic_cast(this->current_decoder_); + decoder->InsertDictionary(&builder_); + this->new_dictionary_ = false; + } + } + + void ReadValuesDense(int64_t values_to_read) override { + int64_t num_decoded = 0; + if (current_encoding_ == Encoding::RLE_DICTIONARY) { + MaybeWriteNewDictionary(); + auto decoder = dynamic_cast(this->current_decoder_); + num_decoded = decoder->DecodeIndices(static_cast(values_to_read), &builder_); + } else { + num_decoded = this->current_decoder_->DecodeArrowNonNull( + static_cast(values_to_read), &builder_); + + /// Flush values since they have been copied into the builder + ResetValues(); + } + CheckNumberDecoded(num_decoded, values_to_read); + } + + void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { + int64_t num_decoded = 0; + if (current_encoding_ == Encoding::RLE_DICTIONARY) { + MaybeWriteNewDictionary(); + auto decoder = dynamic_cast(this->current_decoder_); + num_decoded = decoder->DecodeIndicesSpaced( + static_cast(values_to_read), static_cast(null_count), + valid_bits_->mutable_data(), values_written_, &builder_); + } else { + num_decoded = this->current_decoder_->DecodeArrow( + static_cast(values_to_read), 
static_cast(null_count), + valid_bits_->mutable_data(), values_written_, &builder_); + + /// Flush values since they have been copied into the builder + ResetValues(); + } + ARROW_DCHECK_EQ(num_decoded, values_to_read - null_count); + } + + private: + using LargeBinaryDictDecoder = DictDecoder; + ::arrow::BinaryDictionary64Builder builder_; std::vector> result_chunks_; }; @@ -2231,11 +2355,17 @@ std::shared_ptr MakeByteArrayRecordReader(const ColumnDescriptor* LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dictionary, - bool read_dense_for_nullable) { + bool read_dense_for_nullable, + bool use_binary_string_large_variants) { if (read_dictionary) { return std::make_shared(descr, leaf_info, pool, read_dense_for_nullable); } else { + if (use_binary_string_large_variants) { + return std::make_shared( + descr, leaf_info, pool, read_dense_for_nullable); + } + return std::make_shared(descr, leaf_info, pool, read_dense_for_nullable); } @@ -2246,7 +2376,8 @@ std::shared_ptr MakeByteArrayRecordReader(const ColumnDescriptor* std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, LevelInfo leaf_info, MemoryPool* pool, bool read_dictionary, - bool read_dense_for_nullable) { + bool read_dense_for_nullable, + bool use_binary_string_large_variants) { switch (descr->physical_type()) { case Type::BOOLEAN: return std::make_shared>(descr, leaf_info, pool, @@ -2268,7 +2399,7 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, read_dense_for_nullable); case Type::BYTE_ARRAY: { return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, - read_dense_for_nullable); + read_dense_for_nullable, use_binary_string_large_variants); } case Type::FIXED_LEN_BYTE_ARRAY: return std::make_shared(descr, leaf_info, pool, diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index b652a89d8cf3b..94f42e7db6563 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -321,7 +321,8 @@ class PARQUET_EXPORT RecordReader { static std::shared_ptr Make( const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool read_dictionary = false, bool read_dense_for_nullable = false); + bool read_dictionary = false, bool read_dense_for_nullable = false, + bool use_binary_string_large_variants = false); virtual ~RecordReader() = default; @@ -470,7 +471,7 @@ class BinaryRecordReader : virtual public RecordReader { virtual std::vector> GetBuilderChunks() = 0; }; -class LargeBinaryRecordReader : virtual public RecordReader { +class LargeBinaryRecordReader : virtual public BinaryRecordReader { public: virtual std::vector> GetBuilderChunks() = 0; }; diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b52cd3b303c29..8e350f39f1393 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1271,6 +1271,43 @@ struct ArrowBinaryHelper { Status AppendNull() { return builder->AppendNull(); } typename EncodingTraits::Accumulator* out; + ::arrow::BinaryBuilder* builder; + int64_t chunk_space_remaining; +}; + +struct ArrowLargeBinaryHelper { + explicit ArrowLargeBinaryHelper(typename EncodingTraits::Accumulator* out) { + this->out = out; + this->builder = out->builder.get(); + this->chunk_space_remaining = + ::arrow::kLargeBinaryMemoryLimit - this->builder->value_data_length(); + } + + Status PushChunk() { + std::shared_ptr<::arrow::Array> result; + RETURN_NOT_OK(builder->Finish(&result)); + out->chunks.push_back(result); + chunk_space_remaining = 
::arrow::kLargeBinaryMemoryLimit; + return Status::OK(); + } + + bool CanFit(int64_t length) const { return length <= chunk_space_remaining; } + + void UnsafeAppend(const uint8_t* data, int64_t length) { + chunk_space_remaining -= length; + builder->UnsafeAppend(data, length); + } + + void UnsafeAppendNull() { builder->UnsafeAppendNull(); } + + Status Append(const uint8_t* data, int64_t length) { + chunk_space_remaining -= length; + return builder->Append(data, length); + } + + Status AppendNull() { return builder->AppendNull(); } + + typename EncodingTraits::Accumulator* out; ::arrow::LargeBinaryBuilder* builder; int64_t chunk_space_remaining; }; @@ -1289,6 +1326,20 @@ inline int PlainDecoder::DecodeArrow( ParquetException::NYI(); } +template <> +inline int PlainDecoder::DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) { + ParquetException::NYI(); +} + +template <> +inline int PlainDecoder::DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) { + ParquetException::NYI(); +} + template <> inline int PlainDecoder::DecodeArrow( int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, @@ -1349,7 +1400,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary64Builder* builder) override { + ::arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -1455,6 +1506,124 @@ class PlainByteArrayDecoder : public PlainDecoder, } }; +class PlainLargeByteArrayDecoder : public PlainDecoder, + virtual public LargeByteArrayDecoder { + public: + using Base = PlainDecoder; + using Base::DecodeSpaced; + using Base::PlainDecoder; + + // ---------------------------------------------------------------------- + // Dictionary read paths + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::BinaryDictionary64Builder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + return result; + } + + // ---------------------------------------------------------------------- + // Optimized dense binary read paths + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, + valid_bits_offset, out, &result)); + return result; + } + + private: + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_values_decoded) { + ArrowLargeBinaryHelper helper(out); + int values_decoded = 0; + + RETURN_NOT_OK(helper.builder->Reserve(num_values)); + RETURN_NOT_OK(helper.builder->ReserveData( + std::min(len_, helper.chunk_space_remaining))); + + int i = 0; + RETURN_NOT_OK(VisitNullBitmapInline( + valid_bits, valid_bits_offset, num_values, null_count, + [&]() { + if (ARROW_PREDICT_FALSE(len_ < 4)) { + ParquetException::EofException(); + } + auto value_len = SafeLoadAs(data_); + if 
(ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { + return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); + } + auto increment = value_len + 4; + if (ARROW_PREDICT_FALSE(len_ < increment)) { + ParquetException::EofException(); + } + if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) { + // This element would exceed the capacity of a chunk + RETURN_NOT_OK(helper.PushChunk()); + RETURN_NOT_OK(helper.builder->Reserve(num_values - i)); + RETURN_NOT_OK(helper.builder->ReserveData( + std::min(len_, helper.chunk_space_remaining))); + } + helper.UnsafeAppend(data_ + 4, value_len); + data_ += increment; + len_ -= increment; + ++values_decoded; + ++i; + return Status::OK(); + }, + [&]() { + helper.UnsafeAppendNull(); + ++i; + return Status::OK(); + })); + + num_values_ -= values_decoded; + *out_values_decoded = values_decoded; + return Status::OK(); + } + + template + Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_values_decoded) { + RETURN_NOT_OK(builder->Reserve(num_values)); + int values_decoded = 0; + + RETURN_NOT_OK(VisitNullBitmapInline( + valid_bits, valid_bits_offset, num_values, null_count, + [&]() { + if (ARROW_PREDICT_FALSE(len_ < 4)) { + ParquetException::EofException(); + } + auto value_len = SafeLoadAs(data_); + if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { + return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); + } + auto increment = value_len + 4; + if (ARROW_PREDICT_FALSE(len_ < increment)) { + ParquetException::EofException(); + } + RETURN_NOT_OK(builder->Append(data_ + 4, value_len)); + data_ += increment; + len_ -= increment; + ++values_decoded; + return Status::OK(); + }, + [&]() { return builder->AppendNull(); })); + + num_values_ -= values_decoded; + *out_values_decoded = values_decoded; + return Status::OK(); + } +}; + class PlainFLBADecoder : public PlainDecoder, virtual public FLBADecoder { public: using Base = PlainDecoder; @@ -1677,6 +1846,36 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictio bytes_offsets[dictionary_length_] = offset; } +template <> +void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { + [[maybe_unused]] auto z = dictionary->values_left(); + DecodeDict(dictionary); + + auto dict_values = reinterpret_cast(dictionary_->mutable_data()); + + uint64_t total_size = 0; + for (int i = 0; i < dictionary_length_; ++i) { + total_size += dict_values[i].len; + } + PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, + /*shrink_to_fit=*/false)); + PARQUET_THROW_NOT_OK( + byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int64_t), + /*shrink_to_fit=*/false)); + + int64_t offset = 0; + uint8_t* bytes_data = byte_array_data_->mutable_data(); + int64_t* bytes_offsets = + reinterpret_cast(byte_array_offsets_->mutable_data()); + for (int i = 0; i < dictionary_length_; ++i) { + memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); + bytes_offsets[i] = offset; + dict_values[i].ptr = bytes_data + offset; + offset += dict_values[i].len; + } + bytes_offsets[dictionary_length_] = offset; +} + template <> inline void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { DecodeDict(dictionary); @@ -1723,6 +1922,20 @@ inline int DictDecoderImpl::DecodeArrow( ParquetException::NYI("DecodeArrow implemented elsewhere"); } +template <> +inline int DictDecoderImpl::DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + 
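// SetDict above flattens the decoded dictionary into one contiguous buffer and
// an offsets array (layout sketch):
//
//   bytes_data:    [v0 bytes][v1 bytes] ... [vN-1 bytes]
//   bytes_offsets: [0, len(v0), len(v0)+len(v1), ..., total_size]
//
// Each dict_values[i].ptr is repointed into bytes_data, so index lookups later
// in the read path no longer depend on the dictionary page's scratch buffers.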
typename EncodingTraits::Accumulator* builder) { + ParquetException::NYI("DecodeArrow implemented elsewhere"); +} + +template <> +inline int DictDecoderImpl::DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) { + ParquetException::NYI("DecodeArrow implemented elsewhere"); +} + template int DictDecoderImpl::DecodeArrow( int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, @@ -1854,6 +2067,16 @@ void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* bui PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); } +template <> +void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { + auto binary_builder = checked_cast<::arrow::BinaryDictionary64Builder*>(builder); + + // Make a BinaryArray referencing the internal dictionary data + auto arr = std::make_shared<::arrow::LargeBinaryArray>( + dictionary_length_, byte_array_offsets_, byte_array_data_); + PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); +} + class DictByteArrayDecoderImpl : public DictDecoderImpl, virtual public ByteArrayDecoder { public: @@ -1862,7 +2085,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary64Builder* builder) override { + ::arrow::BinaryDictionary32Builder* builder) override { int result = 0; if (null_count == 0) { PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); @@ -2068,6 +2291,220 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, } }; +class DictLargeByteArrayDecoderImpl : public DictDecoderImpl, + virtual public LargeByteArrayDecoder { + public: + using BASE = DictDecoderImpl; + using BASE::DictDecoderImpl; + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::BinaryDictionary64Builder* builder) override { + int result = 0; + if (null_count == 0) { + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); + } else { + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + } + return result; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out) override { + int result = 0; + if (null_count == 0) { + PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result)); + } else { + PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, + valid_bits_offset, out, &result)); + } + return result; + } + + private: + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + constexpr int32_t kBufferSize = 1024; + int32_t indices[kBufferSize]; + + ArrowLargeBinaryHelper helper(out); + + auto dict_values = reinterpret_cast(dictionary_->data()); + int values_decoded = 0; + int num_indices = 0; + int pos_indices = 0; + + auto visit_valid = [&](int64_t position) -> Status { + if (num_indices == pos_indices) { + // Refill indices buffer + const auto batch_size = + std::min(kBufferSize, num_values - null_count - values_decoded); + num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (ARROW_PREDICT_FALSE(num_indices < 1)) { + return Status::Invalid("Invalid number of indices: ", num_indices); 
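// Refill pattern used by visit_valid: dictionary indices come out of the
// RLE/bit-packed run in batches into a small stack buffer and are consumed one
// per non-null slot (sketch, using this function's locals):
//
//   if (pos_indices == num_indices) {                            // buffer drained
//     num_indices = idx_decoder_.GetBatch(indices, batch_size);  // refill
//     pos_indices = 0;
//   }
//   const int32_t index = indices[pos_indices++];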
+ } + pos_indices = 0; + } + const auto index = indices[pos_indices++]; + RETURN_NOT_OK(IndexInBounds(index)); + const auto& val = dict_values[index]; + if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { + RETURN_NOT_OK(helper.PushChunk()); + } + RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); + ++values_decoded; + return Status::OK(); + }; + + auto visit_null = [&]() -> Status { + RETURN_NOT_OK(helper.AppendNull()); + return Status::OK(); + }; + + ::arrow::internal::BitBlockCounter bit_blocks(valid_bits, valid_bits_offset, + num_values); + int64_t position = 0; + while (position < num_values) { + const auto block = bit_blocks.NextWord(); + if (block.AllSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + ARROW_RETURN_NOT_OK(visit_valid(position)); + } + } else if (block.NoneSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + ARROW_RETURN_NOT_OK(visit_null()); + } + } else { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (bit_util::GetBit(valid_bits, valid_bits_offset + position)) { + ARROW_RETURN_NOT_OK(visit_valid(position)); + } else { + ARROW_RETURN_NOT_OK(visit_null()); + } + } + } + } + + *out_num_values = values_decoded; + return Status::OK(); + } + + Status DecodeArrowDenseNonNull(int num_values, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + constexpr int32_t kBufferSize = 2048; + int32_t indices[kBufferSize]; + int values_decoded = 0; + + ArrowLargeBinaryHelper helper(out); + auto dict_values = reinterpret_cast(dictionary_->data()); + + while (values_decoded < num_values) { + int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (num_indices == 0) ParquetException::EofException(); + for (int i = 0; i < num_indices; ++i) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { + RETURN_NOT_OK(helper.PushChunk()); + } + RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); + } + values_decoded += num_indices; + } + *out_num_values = values_decoded; + return Status::OK(); + } + + template + Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_num_values) { + constexpr int32_t kBufferSize = 1024; + int32_t indices[kBufferSize]; + + RETURN_NOT_OK(builder->Reserve(num_values)); + ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + + auto dict_values = reinterpret_cast(dictionary_->data()); + + int values_decoded = 0; + int num_appended = 0; + while (num_appended < num_values) { + bool is_valid = bit_reader.IsSet(); + bit_reader.Next(); + + if (is_valid) { + int32_t batch_size = + std::min(kBufferSize, num_values - num_appended - null_count); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + + int i = 0; + while (true) { + // Consume all indices + if (is_valid) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + RETURN_NOT_OK(builder->Append(val.ptr, val.len)); + ++i; + ++values_decoded; + } else { + RETURN_NOT_OK(builder->AppendNull()); + --null_count; + } + ++num_appended; + if (i == num_indices) { + // Do not advance the bit_reader if we have fulfilled the decode + // request + break; + } + is_valid = bit_reader.IsSet(); + bit_reader.Next(); + } + } else { + RETURN_NOT_OK(builder->AppendNull()); + --null_count; + 
++num_appended; + } + } + *out_num_values = values_decoded; + return Status::OK(); + } + + template + Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) { + constexpr int32_t kBufferSize = 2048; + int32_t indices[kBufferSize]; + + RETURN_NOT_OK(builder->Reserve(num_values)); + + auto dict_values = reinterpret_cast(dictionary_->data()); + + int values_decoded = 0; + while (values_decoded < num_values) { + int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (num_indices == 0) ParquetException::EofException(); + for (int i = 0; i < num_indices; ++i) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + RETURN_NOT_OK(builder->Append(val.ptr, val.len)); + } + values_decoded += num_indices; + } + *out_num_values = values_decoded; + return Status::OK(); + } +}; + // ---------------------------------------------------------------------- // DeltaBitPackEncoder @@ -3439,6 +3876,8 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin return std::make_unique>(descr); case Type::BYTE_ARRAY: return std::make_unique(descr); + case Type::LARGE_BYTE_ARRAY: + return std::make_unique(descr); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique(descr); default: @@ -3504,6 +3943,8 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, return std::make_unique>(descr, pool); case Type::BYTE_ARRAY: return std::make_unique(descr, pool); + case Type::LARGE_BYTE_ARRAY: + return std::make_unique(descr, pool); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, pool); default: diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index ab80284e6f83b..30345852912e8 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -62,6 +62,7 @@ using Int96Encoder = TypedEncoder; using FloatEncoder = TypedEncoder; using DoubleEncoder = TypedEncoder; using ByteArrayEncoder = TypedEncoder; +using LargeByteArrayEncoder = TypedEncoder; using FLBAEncoder = TypedEncoder; template @@ -74,6 +75,7 @@ using Int96Decoder = TypedDecoder; using FloatDecoder = TypedDecoder; using DoubleDecoder = TypedDecoder; using ByteArrayDecoder = TypedDecoder; +using LargeByteArrayDecoder = TypedDecoder; class FLBADecoder; template @@ -143,6 +145,21 @@ struct EncodingTraits { using Encoder = ByteArrayEncoder; using Decoder = ByteArrayDecoder; + /// \brief Internal helper class for decoding BYTE_ARRAY data where we can + /// overflow the capacity of a single arrow::BinaryArray + struct Accumulator { + std::unique_ptr<::arrow::BinaryBuilder> builder; + std::vector> chunks; + }; + using ArrowType = ::arrow::BinaryType; + using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; +}; + +template <> +struct EncodingTraits { + using Encoder = LargeByteArrayEncoder; + using Decoder = LargeByteArrayDecoder; + /// \brief Internal helper class for decoding BYTE_ARRAY data where we can /// overflow the capacity of a single arrow::BinaryArray struct Accumulator { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 0a9864de6266a..4da43ffe91b23 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -116,6 +116,12 @@ class PARQUET_EXPORT ReaderProperties { page_checksum_verification_ = check_crc; } + bool use_binary_large_variants() const { return use_binary_large_variants_; } + + void set_use_binary_large_variants(bool use_binary_large_variants) { + 
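// A usage sketch for this opt-in flag (hypothetical call site; at this point
// in the series the switch still lives on parquet::ReaderProperties):
//
//   parquet::ReaderProperties props = parquet::default_reader_properties();
//   props.set_use_binary_large_variants(true);
//   // Readers built from these properties decode BYTE_ARRAY columns into
//   // LargeBinary/LargeString (64-bit offsets) instead of Binary/String.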
use_binary_large_variants_ = use_binary_large_variants; + } + private: MemoryPool* pool_; int64_t buffer_size_ = kDefaultBufferSize; @@ -124,6 +130,7 @@ class PARQUET_EXPORT ReaderProperties { bool buffered_stream_enabled_ = false; bool page_checksum_verification_ = false; std::shared_ptr file_decryption_properties_; + bool use_binary_large_variants_ = false; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -851,6 +858,14 @@ class PARQUET_EXPORT ArrowReaderProperties { return coerce_int96_timestamp_unit_; } + void set_use_binary_large_variants(bool use_binary_large_variants) { + use_binary_large_variants_ = use_binary_large_variants; + } + + bool use_binary_large_variants() const { + return use_binary_large_variants_; + } + private: bool use_threads_; std::unordered_set read_dict_indices_; @@ -859,6 +874,7 @@ class PARQUET_EXPORT ArrowReaderProperties { ::arrow::io::IOContext io_context_; ::arrow::io::CacheOptions cache_options_; ::arrow::TimeUnit::type coerce_int96_timestamp_unit_; + bool use_binary_large_variants_; }; /// EXPERIMENTAL: Constructs the default ArrowReaderProperties diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index d4d6a73f147fc..3edabf6d311c2 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -64,6 +64,9 @@ struct Type { DOUBLE = 5, BYTE_ARRAY = 6, FIXED_LEN_BYTE_ARRAY = 7, + + // workaround + LARGE_BYTE_ARRAY = 8, // Should always be last element. UNDEFINED = 8 }; @@ -588,6 +591,26 @@ inline bool operator!=(const ByteArray& left, const ByteArray& right) { return !(left == right); } +struct LargeByteArray { + LargeByteArray() : len(0), ptr(NULLPTR) {} + LargeByteArray(uint64_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} + + LargeByteArray(::std::string_view view) // NOLINT implicit conversion + : LargeByteArray(view.size(), + reinterpret_cast(view.data())) {} + uint64_t len; + const uint8_t* ptr; +}; + +inline bool operator==(const LargeByteArray& left, const LargeByteArray& right) { + return left.len == right.len && + (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0); +} + +inline bool operator!=(const LargeByteArray& left, const LargeByteArray& right) { + return !(left == right); +} + struct FixedLenByteArray { FixedLenByteArray() : ptr(NULLPTR) {} explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {} @@ -740,6 +763,14 @@ struct type_traits { static constexpr const char* printf_code = "s"; }; +template<> +struct type_traits { + using value_type = LargeByteArray; + + static constexpr int value_byte_size = sizeof(LargeByteArray); + static constexpr const char* printf_code = "ls"; +}; + template <> struct type_traits { using value_type = FixedLenByteArray; @@ -761,6 +792,7 @@ using Int96Type = PhysicalType; using FloatType = PhysicalType; using DoubleType = PhysicalType; using ByteArrayType = PhysicalType; +using LargeByteArrayType = PhysicalType; using FLBAType = PhysicalType; template From 34917d5608330cd0961e32f8997f2e513e03dae2 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 29 May 2023 15:22:21 -0300 Subject: [PATCH 04/69] still not working --- cpp/src/parquet/arrow/reader.cc | 2 +- cpp/src/parquet/column_reader.cc | 4 ++-- cpp/src/parquet/encoding.cc | 11 +++++------ cpp/src/parquet/types.h | 4 ++-- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 09b3c8c5f8fdd..8d636810e7c69 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -462,7 +462,7 @@ class 
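// LargeByteArray, added in types.h above, mirrors parquet::ByteArray with a
// widened length field; equality is length plus memcmp (sketch):
//
//   LargeByteArray a(3, reinterpret_cast<const uint8_t*>("abc"));
//   LargeByteArray b(std::string_view("abc"));   // implicit conversion ctor
//   assert(a == b);  // lengths match and memcmp over 3 bytes returns 0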
LeafReader : public ColumnReaderImpl { input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make( - descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, true /* large variants*/); + descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, true /* use_binary_string_large_variants */); NextRowGroup(); } diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 360c506dcff4a..e50f2262af32e 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2142,7 +2142,7 @@ class LargeByteArrayChunkedRecordReader : public TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { - ARROW_DCHECK_EQ(descr_->physical_type(), Type::LARGE_BYTE_ARRAY); + ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); } @@ -2362,7 +2362,7 @@ std::shared_ptr MakeByteArrayRecordReader(const ColumnDescriptor* read_dense_for_nullable); } else { if (use_binary_string_large_variants) { - return std::make_shared( + return std::make_shared( descr, leaf_info, pool, read_dense_for_nullable); } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 8e350f39f1393..1aaca5b23f567 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1848,25 +1848,24 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictio template <> void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { - [[maybe_unused]] auto z = dictionary->values_left(); DecodeDict(dictionary); auto dict_values = reinterpret_cast(dictionary_->mutable_data()); - uint64_t total_size = 0; + uint32_t total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { total_size += dict_values[i].len; } PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); PARQUET_THROW_NOT_OK( - byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int64_t), + byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t), /*shrink_to_fit=*/false)); - int64_t offset = 0; + int32_t offset = 0; uint8_t* bytes_data = byte_array_data_->mutable_data(); - int64_t* bytes_offsets = - reinterpret_cast(byte_array_offsets_->mutable_data()); + int32_t* bytes_offsets = + reinterpret_cast(byte_array_offsets_->mutable_data()); for (int i = 0; i < dictionary_length_; ++i) { memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); bytes_offsets[i] = offset; diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 3edabf6d311c2..14091fa7dd156 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -593,12 +593,12 @@ inline bool operator!=(const ByteArray& left, const ByteArray& right) { struct LargeByteArray { LargeByteArray() : len(0), ptr(NULLPTR) {} - LargeByteArray(uint64_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} + LargeByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} LargeByteArray(::std::string_view view) // NOLINT implicit conversion : LargeByteArray(view.size(), reinterpret_cast(view.data())) {} - uint64_t len; + uint32_t len; const uint8_t* ptr; }; From 835b07dc688554c62efb0c604be421d7a2174943 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 29 May 2023 17:27:51 -0300 Subject: [PATCH 05/69] able to read the file again --- cpp/src/arrow/type.h | 2 +- cpp/src/parquet/arrow/reader.cc | 2 +- cpp/src/parquet/encoding.cc | 38 +++++++++++++++++++++++++++++++-- 3 files changed, 38 insertions(+), 4 deletions(-) 
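This commit adjusts kLargeBinaryMemoryLimit and adds a PLAIN decode path for
LargeByteArray. A sketch of the length-prefixed read it introduces (names taken
from the ReadLargeByteArray helper added in encoding.cc below; illustrative
only):

    LargeByteArray value;
    int64_t consumed = ReadLargeByteArray(data, data_size, &value);
    data += consumed;        // 4-byte length prefix plus value.len payload
    data_size -= consumed;   // EOF and negative-length checks happen inside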
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 820312e7a0c77..57be85114187d 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -679,7 +679,7 @@ class ARROW_EXPORT BaseBinaryType : public DataType { constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; -constexpr int64_t kLargeBinaryMemoryLimit = std::numeric_limits::max() - 1; // 2^35 +constexpr int64_t kLargeBinaryMemoryLimit = std::numeric_limits::max() - 1; // 2^35 /// \addtogroup binary-datatypes /// diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 8d636810e7c69..8e68e1701b8d3 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -462,7 +462,7 @@ class LeafReader : public ColumnReaderImpl { input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make( - descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, true /* use_binary_string_large_variants */); + descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, true /* use_binary_large_variants */); NextRowGroup(); } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 1aaca5b23f567..3295df90d5933 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1126,6 +1127,39 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size, int nu return bytes_decoded; } +static inline int64_t ReadLargeByteArray(const uint8_t* data, int64_t data_size, + LargeByteArray* out) { + if (ARROW_PREDICT_FALSE(data_size < 4)) { + ParquetException::EofException(); + } + const int32_t len = SafeLoadAs(data); + if (len < 0) { + throw ParquetException("Invalid BYTE_ARRAY value"); + } + const int64_t consumed_length = static_cast(len) + 4; + if (ARROW_PREDICT_FALSE(data_size < consumed_length)) { + ParquetException::EofException(); + } + *out = LargeByteArray{static_cast(len), data + 4}; + return consumed_length; +} + +template <> +inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values, + int type_length, LargeByteArray* out) { + int bytes_decoded = 0; + for (int i = 0; i < num_values; ++i) { + const auto increment = ReadLargeByteArray(data, data_size, out + i); + if (ARROW_PREDICT_FALSE(increment > INT_MAX - bytes_decoded)) { + throw ParquetException("BYTE_ARRAY chunk too large"); + } + data += increment; + data_size -= increment; + bytes_decoded += static_cast(increment); + } + return bytes_decoded; +} + // Template specialization for FIXED_LEN_BYTE_ARRAY. The written values do not // own their own data. 
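// Note the guard in DecodePlain<LargeByteArray> above: bytes_decoded is an
// int, so each per-value increment is checked before accumulating (sketch of
// the invariant, same names as the loop):
//
//   if (ARROW_PREDICT_FALSE(increment > INT_MAX - bytes_decoded)) {
//     throw ParquetException("BYTE_ARRAY chunk too large");
//   }
//
// rejecting pages whose decoded size would overflow the return type.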
template <> @@ -1850,9 +1884,9 @@ template <> void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { DecodeDict(dictionary); - auto dict_values = reinterpret_cast(dictionary_->mutable_data()); + auto dict_values = reinterpret_cast(dictionary_->mutable_data()); - uint32_t total_size = 0; + int total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { total_size += dict_values[i].len; } From 50427c6af4b6099ae864a5216fe94fb97c5e3fbe Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 09:49:51 -0300 Subject: [PATCH 06/69] move use_binary_large_variants to arrowreaderproperties --- cpp/examples/parquet/parquet_arrow/reader_writer.cc | 10 ++++++---- cpp/src/parquet/arrow/reader.cc | 3 ++- cpp/src/parquet/arrow/reader_internal.h | 1 + 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index 8357f380d106e..eaf94c6ad7e96 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -71,11 +71,13 @@ void read_whole_file(const std::string & filename) { parquet::arrow::FileReaderBuilder builder; - parquet::ReaderProperties props = parquet::default_reader_properties(); + parquet::ArrowReaderProperties properties; -// props.set_use_binary_large_variants(true); + properties.set_use_binary_large_variants(true); - PARQUET_THROW_NOT_OK(builder.Open(infile, props)); + builder.properties(properties); + + PARQUET_THROW_NOT_OK(builder.Open(infile)); PARQUET_THROW_NOT_OK(builder.Build(&reader)); @@ -145,7 +147,7 @@ void read_single_column_chunk() { int main(int argc, char** argv) { // std::shared_ptr table = generate_table(); // write_parquet_file(*table); - read_whole_file("minimal_repro.parquet"); + read_whole_file("chunked_jira.parquet"); // read_single_rowgroup(); // read_single_column("minimal_repro.parquet"); // read_single_column_chunk(); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 8e68e1701b8d3..b95be9cdd1415 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -219,6 +219,7 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; + ctx->use_binary_large_variants = reader_properties_.use_binary_large_variants(); return GetReader(manifest_.schema_fields[i], ctx, out); } @@ -462,7 +463,7 @@ class LeafReader : public ColumnReaderImpl { input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make( - descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, true /* use_binary_large_variants */); + descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, ctx_->use_binary_large_variants); NextRowGroup(); } diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index cf9dbb86577b5..c5ee54b7c03d4 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -109,6 +109,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; + bool use_binary_large_variants = false; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { From df65ce7087860e047552bf3e9e73cad39e010679 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 10:14:54 -0300 Subject: [PATCH 07/69] cleanup a 
bit --- cpp/examples/arrow/parquet_read_write.cc | 2 +- cpp/examples/parquet/parquet_arrow/reader_writer.cc | 9 ++++----- cpp/src/arrow/memory_pool.cc | 3 --- cpp/src/arrow/type.h | 3 +-- cpp/src/parquet/column_scanner.h | 8 ++++++++ cpp/src/parquet/encoding.cc | 1 - cpp/src/parquet/types.h | 4 ++++ 7 files changed, 18 insertions(+), 12 deletions(-) diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index 20fe2c20b291a..3b8b4c2212b75 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -120,7 +120,7 @@ arrow::Status WriteFullFile(std::string path_to_file) { ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), outfile, - /*chunk_size=*/1024*1024*1024, props, arrow_props)); + /*chunk_size=*/3, props, arrow_props)); return arrow::Status::OK(); } diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index eaf94c6ad7e96..343914c367faa 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -56,7 +56,7 @@ void write_parquet_file(const arrow::Table& table) { // the parquet file. Normally you would choose this to be rather large but // for the example, we use a small value to have multiple RowGroups. PARQUET_THROW_NOT_OK( - parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 1024*1024 * 1024)); + parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3)); } // #2: Fully read in the file @@ -108,18 +108,17 @@ void read_single_rowgroup() { } // #4: Read only a single column of the whole parquet file -void read_single_column(const std::string & filename) { - std::cout << "Reading first column of " << filename << std::endl; +void read_single_column() { std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, - arrow::io::ReadableFile::Open(filename, + arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", arrow::default_memory_pool())); std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); std::shared_ptr array; - PARQUET_THROW_NOT_OK(reader->ReadColumn(5, &array)); + PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; } diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index dd14953f7ff47..843329c17bc28 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -888,9 +888,6 @@ class PoolBuffer final : public ResizableBuffer { capacity_ = new_capacity; } } else { - if (new_size > static_cast(pow(2, 59))) { - assert(false); - } RETURN_NOT_OK(Reserve(new_size)); } size_ = new_size; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 57be85114187d..64d2893a9725c 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -679,7 +678,7 @@ class ARROW_EXPORT BaseBinaryType : public DataType { constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; -constexpr int64_t kLargeBinaryMemoryLimit = std::numeric_limits::max() - 1; // 2^35 +constexpr int64_t kLargeBinaryMemoryLimit = std::numeric_limits::max() - 1; /// \addtogroup binary-datatypes /// diff --git a/cpp/src/parquet/column_scanner.h b/cpp/src/parquet/column_scanner.h index d53435f03cd32..7bea4ca24d6db 100644 --- 
a/cpp/src/parquet/column_scanner.h +++ b/cpp/src/parquet/column_scanner.h @@ -225,6 +225,14 @@ inline void TypedScanner::FormatValue(void* val, char* buffer, in snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); } +template <> +inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, + int width) { + std::string fmt = format_fwf(width); + std::string result = LargeByteArrayToString(*reinterpret_cast(val)); + snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); +} + template <> inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, int width) { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 3295df90d5933..f124db0736875 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 14091fa7dd156..226d0c5730a7b 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -645,6 +645,10 @@ static inline std::string ByteArrayToString(const ByteArray& a) { return std::string(reinterpret_cast(a.ptr), a.len); } +static inline std::string LargeByteArrayToString(const LargeByteArray& a) { + return std::string(reinterpret_cast(a.ptr), a.len); +} + static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) { std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); } From e826b8e82de3ea361599f1e9b600861fc503d227 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 11:08:06 -0300 Subject: [PATCH 08/69] back fromByteArray string & binary with setting --- .../parquet/parquet_arrow/reader_writer.cc | 12 ++++++------ cpp/src/parquet/arrow/schema.cc | 2 +- cpp/src/parquet/arrow/schema_internal.cc | 15 ++++++++------- cpp/src/parquet/arrow/schema_internal.h | 11 +++++++---- 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index 343914c367faa..6468edd67534d 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -84,6 +84,10 @@ void read_whole_file(const std::string & filename) { // PARQUET_THROW_NOT_OK( // parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr<::arrow::Schema> schema; + + [[maybe_unused]] auto metadata = reader->GetSchema(&schema); + std::shared_ptr table; PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() @@ -144,10 +148,6 @@ void read_single_column_chunk() { } int main(int argc, char** argv) { -// std::shared_ptr table = generate_table(); -// write_parquet_file(*table); - read_whole_file("chunked_jira.parquet"); -// read_single_rowgroup(); -// read_single_column("minimal_repro.parquet"); -// read_single_column_chunk(); +// read_whole_file("chunked_jira.parquet"); + read_whole_file("minimal_repro.parquet"); } diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index c5d5e0743a7f1..4920bad21f0df 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -473,7 +473,7 @@ ::arrow::Result> GetTypeForNode( SchemaTreeContext* ctx) { ASSIGN_OR_RAISE( std::shared_ptr storage_type, - GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit())); + GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit(), 
ctx->properties.use_binary_large_variants())); if (ctx->properties.read_dictionary(column_index) && IsDictionaryReadSupported(*storage_type)) { return ::arrow::dictionary(::arrow::int32(), storage_type); diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index dbff14d93b84e..b9f6bea7e0ecb 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -110,17 +110,17 @@ Result> MakeArrowTimestamp(const LogicalType& logical } } -Result> FromByteArray(const LogicalType& logical_type) { +Result> FromByteArray(const LogicalType& logical_type, bool use_binary_large_variant) { switch (logical_type.type()) { case LogicalType::Type::STRING: - return ::arrow::large_utf8(); + return use_binary_large_variant ? ::arrow::large_utf8() : ::arrow::utf8(); case LogicalType::Type::DECIMAL: return MakeArrowDecimal(logical_type); case LogicalType::Type::NONE: case LogicalType::Type::ENUM: case LogicalType::Type::JSON: case LogicalType::Type::BSON: - return ::arrow::binary(); + return use_binary_large_variant ? ::arrow::large_binary() : ::arrow::binary(); default: return Status::NotImplemented("Unhandled logical logical_type ", logical_type.ToString(), " for binary array"); @@ -181,7 +181,7 @@ Result> FromInt64(const LogicalType& logical_type) { Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, - const ::arrow::TimeUnit::type int96_arrow_time_unit) { + const ::arrow::TimeUnit::type int96_arrow_time_unit, bool use_binary_large_variant) { if (logical_type.is_invalid() || logical_type.is_null()) { return ::arrow::null(); } @@ -200,7 +200,7 @@ Result> GetArrowType( case ParquetType::DOUBLE: return ::arrow::float64(); case ParquetType::BYTE_ARRAY: - return FromByteArray(logical_type); + return FromByteArray(logical_type, use_binary_large_variant); case ParquetType::FIXED_LEN_BYTE_ARRAY: return FromFLBA(logical_type, type_length); default: { @@ -213,9 +213,10 @@ Result> GetArrowType( Result> GetArrowType( const schema::PrimitiveNode& primitive, - const ::arrow::TimeUnit::type int96_arrow_time_unit) { + const ::arrow::TimeUnit::type int96_arrow_time_unit, + bool use_binary_large_variant) { return GetArrowType(primitive.physical_type(), *primitive.logical_type(), - primitive.type_length(), int96_arrow_time_unit); + primitive.type_length(), int96_arrow_time_unit, use_binary_large_variant); } } // namespace arrow diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index fb837c3ee6cab..e17f2d2d07c5b 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -29,7 +29,7 @@ namespace arrow { using ::arrow::Result; -Result> FromByteArray(const LogicalType& logical_type); +Result> FromByteArray(const LogicalType& logical_type, bool use_binary_large_variant = false); Result> FromFLBA(const LogicalType& logical_type, int32_t physical_length); Result> FromInt32(const LogicalType& logical_type); @@ -37,15 +37,18 @@ Result> FromInt64(const LogicalType& logical_ Result> GetArrowType(Type::type physical_type, const LogicalType& logical_type, - int type_length); + int type_length, + bool use_binary_large_variant = false); Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, - ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); + ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO, + bool use_binary_large_variant = false); Result> 
GetArrowType( const schema::PrimitiveNode& primitive, - ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); + ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO, + bool use_binary_large_variant = false); } // namespace arrow } // namespace parquet From 764ef98a0cf4fb94416e1fb732e60219a13c84af Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 13:40:39 -0300 Subject: [PATCH 09/69] some more adjustments --- cpp/src/parquet/arrow/schema_internal.cc | 25 ++++++++++++++++++++---- cpp/src/parquet/arrow/schema_internal.h | 4 +++- cpp/src/parquet/column_reader.h | 1 + cpp/src/parquet/column_writer.h | 1 + cpp/src/parquet/metadata.cc | 2 ++ cpp/src/parquet/page_index.cc | 4 ++++ cpp/src/parquet/stream_reader.cc | 3 +++ cpp/src/parquet/stream_writer.cc | 4 ++++ cpp/src/parquet/types.cc | 1 + cpp/src/parquet/types.h | 5 +++-- 10 files changed, 43 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index b9f6bea7e0ecb..a256ec4a6d7f9 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -110,17 +110,34 @@ Result> MakeArrowTimestamp(const LogicalType& logical } } -Result> FromByteArray(const LogicalType& logical_type, bool use_binary_large_variant) { +Result> FromByteArray(const LogicalType& logical_type) { switch (logical_type.type()) { case LogicalType::Type::STRING: - return use_binary_large_variant ? ::arrow::large_utf8() : ::arrow::utf8(); + return ::arrow::utf8(); case LogicalType::Type::DECIMAL: return MakeArrowDecimal(logical_type); case LogicalType::Type::NONE: case LogicalType::Type::ENUM: case LogicalType::Type::JSON: case LogicalType::Type::BSON: - return use_binary_large_variant ? ::arrow::large_binary() : ::arrow::binary(); + return ::arrow::binary(); + default: + return Status::NotImplemented("Unhandled logical logical_type ", + logical_type.ToString(), " for binary array"); + } +} + +Result> FromLargeByteArray(const LogicalType& logical_type) { + switch (logical_type.type()) { + case LogicalType::Type::STRING: + return ::arrow::large_utf8(); + case LogicalType::Type::DECIMAL: + return MakeArrowDecimal(logical_type); + case LogicalType::Type::NONE: + case LogicalType::Type::ENUM: + case LogicalType::Type::JSON: + case LogicalType::Type::BSON: + return ::arrow::large_binary(); default: return Status::NotImplemented("Unhandled logical logical_type ", logical_type.ToString(), " for binary array"); @@ -200,7 +217,7 @@ Result> GetArrowType( case ParquetType::DOUBLE: return ::arrow::float64(); case ParquetType::BYTE_ARRAY: - return FromByteArray(logical_type, use_binary_large_variant); + return use_binary_large_variant ? 
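// The mapping selected by this ternary (sketch):
//
//   logical type           large variant off       large variant on
//   STRING                 ::arrow::utf8()         ::arrow::large_utf8()
//   NONE/ENUM/JSON/BSON    ::arrow::binary()       ::arrow::large_binary()
//   DECIMAL                MakeArrowDecimal(...)   MakeArrowDecimal(...)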
FromLargeByteArray(logical_type) : FromByteArray(logical_type); case ParquetType::FIXED_LEN_BYTE_ARRAY: return FromFLBA(logical_type, type_length); default: { diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index e17f2d2d07c5b..d27440ea22301 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -29,7 +29,9 @@ namespace arrow { using ::arrow::Result; -Result> FromByteArray(const LogicalType& logical_type, bool use_binary_large_variant = false); +Result> FromByteArray(const LogicalType& logical_type); +Result> FromLargeByteArray(const LogicalType& logical_type); + Result> FromFLBA(const LogicalType& logical_type, int32_t physical_length); Result> FromInt32(const LogicalType& logical_type); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 94f42e7db6563..2c6dfea9d39a1 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -492,6 +492,7 @@ using Int96Reader = TypedColumnReader; using FloatReader = TypedColumnReader; using DoubleReader = TypedColumnReader; using ByteArrayReader = TypedColumnReader; +using LargeByteArrayReader = TypedColumnReader; using FixedLenByteArrayReader = TypedColumnReader; } // namespace parquet diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 792b108ac8835..545ecbb6732f8 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -233,6 +233,7 @@ using Int96Writer = TypedColumnWriter; using FloatWriter = TypedColumnWriter; using DoubleWriter = TypedColumnWriter; using ByteArrayWriter = TypedColumnWriter; +using LargeByteArrayWriter = TypedColumnWriter; using FixedLenByteArrayWriter = TypedColumnWriter; namespace internal { diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 0bbd96580774a..055e679a9b685 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -123,6 +123,8 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d return MakeTypedColumnStats(meta_data, descr); case Type::BYTE_ARRAY: return MakeTypedColumnStats(meta_data, descr); + case Type::LARGE_BYTE_ARRAY: + return MakeTypedColumnStats(meta_data, descr); case Type::FIXED_LEN_BYTE_ARRAY: return MakeTypedColumnStats(meta_data, descr); case Type::UNDEFINED: diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index d29cc33eb5afd..969db469bbeb5 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -853,6 +853,8 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, return std::make_unique>(descr, column_index); case Type::BYTE_ARRAY: return std::make_unique>(descr, column_index); + case Type::LARGE_BYTE_ARRAY: + return std::make_unique>(descr, column_index); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, column_index); case Type::UNDEFINED: @@ -897,6 +899,8 @@ std::unique_ptr ColumnIndexBuilder::Make( return std::make_unique>(descr); case Type::BYTE_ARRAY: return std::make_unique>(descr); + case Type::LARGE_BYTE_ARRAY: + return std::make_unique>(descr); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr); case Type::UNDEFINED: diff --git a/cpp/src/parquet/stream_reader.cc b/cpp/src/parquet/stream_reader.cc index 0fecb1bf24615..66bcf5ca97560 100644 --- a/cpp/src/parquet/stream_reader.cc +++ b/cpp/src/parquet/stream_reader.cc @@ -488,6 +488,9 @@ void StreamReader::SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_sk case 
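// Pattern used throughout this commit: every switch over Type::type gains a
// LARGE_BYTE_ARRAY arm shaped exactly like the BYTE_ARRAY one, e.g. in
// metadata.cc (sketch):
//
//   case Type::LARGE_BYTE_ARRAY:
//     return MakeTypedColumnStats<LargeByteArrayType>(meta_data, descr);
//
// so the synthetic physical type is routed wherever a real one would be.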
Type::BYTE_ARRAY: num_skipped = static_cast(reader)->Skip(num_rows_to_skip); break; + case Type::LARGE_BYTE_ARRAY: + num_skipped = static_cast(reader)->Skip(num_rows_to_skip); + break; case Type::FIXED_LEN_BYTE_ARRAY: num_skipped = static_cast(reader)->Skip(num_rows_to_skip); break; diff --git a/cpp/src/parquet/stream_writer.cc b/cpp/src/parquet/stream_writer.cc index 856436d701816..d93368740f9a8 100644 --- a/cpp/src/parquet/stream_writer.cc +++ b/cpp/src/parquet/stream_writer.cc @@ -251,6 +251,10 @@ void StreamWriter::WriteNullValue(ColumnWriter* writer) { static_cast(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr); break; + case Type::LARGE_BYTE_ARRAY: + static_cast(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero, + &kRepLevelZero, nullptr); + break; case Type::FIXED_LEN_BYTE_ARRAY: static_cast(writer)->WriteBatch( kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr); diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 28f472aaf9dd8..d5d0442177934 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -260,6 +260,7 @@ SortOrder::type DefaultSortOrder(Type::type primitive) { case Type::DOUBLE: return SortOrder::SIGNED; case Type::BYTE_ARRAY: + case Type::LARGE_BYTE_ARRAY: case Type::FIXED_LEN_BYTE_ARRAY: return SortOrder::UNSIGNED; case Type::INT96: diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 226d0c5730a7b..3a16a84e9e3a0 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -65,10 +65,11 @@ struct Type { BYTE_ARRAY = 6, FIXED_LEN_BYTE_ARRAY = 7, - // workaround + // This parquet type does not actually exist (AFAIK) and is used to + // create proper type traits LARGE_BYTE_ARRAY = 8, // Should always be last element. - UNDEFINED = 8 + UNDEFINED = 9 }; }; From c6244eac25e40f64a72483be01ec038dfe2a1e26 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 13:52:40 -0300 Subject: [PATCH 10/69] revert some stuff --- .../parquet/parquet_arrow/reader_writer.cc | 39 +++++++------------ 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index 6468edd67534d..debf62736bdd0 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -60,34 +60,16 @@ void write_parquet_file(const arrow::Table& table) { } // #2: Fully read in the file -void read_whole_file(const std::string & filename) { - std::cout << "Reading " << filename << " at once" << std::endl; +void read_whole_file() { + std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl; std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, - arrow::io::ReadableFile::Open(filename, + arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", arrow::default_memory_pool())); std::unique_ptr reader; - - parquet::arrow::FileReaderBuilder builder; - - parquet::ArrowReaderProperties properties; - - properties.set_use_binary_large_variants(true); - - builder.properties(properties); - - PARQUET_THROW_NOT_OK(builder.Open(infile)); - - PARQUET_THROW_NOT_OK(builder.Build(&reader)); - -// PARQUET_THROW_NOT_OK( -// parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - - std::shared_ptr<::arrow::Schema> schema; - - [[maybe_unused]] auto metadata = reader->GetSchema(&schema); - + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); std::shared_ptr table; 
PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() @@ -113,6 +95,7 @@ void read_single_rowgroup() { // #4: Read only a single column of the whole parquet file void read_single_column() { + std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl; std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", @@ -148,6 +131,10 @@ void read_single_column_chunk() { } int main(int argc, char** argv) { -// read_whole_file("chunked_jira.parquet"); - read_whole_file("minimal_repro.parquet"); -} + std::shared_ptr table = generate_table(); + write_parquet_file(*table); + read_whole_file(); + read_single_rowgroup(); + read_single_column(); + read_single_column_chunk(); +} \ No newline at end of file From 5a4bbb0dbae93007ed135dc5377d573a7caeeed6 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 13:56:17 -0300 Subject: [PATCH 11/69] revert some stuff --- cpp/CMakePresets.json | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 40ccd64a93695..7882be57a0534 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -220,12 +220,7 @@ "features-main" ], "displayName": "Debug build with tests and more optional components", - "cacheVariables": { - "ARROW_BUILD_EXAMPLES": "ON", - "PARQUET_BUILD_EXAMPLES": "ON", - "ARROW_BUILD_TESTS": "ON", - "ARROW_BUILD_UTILITIES": "ON" - } + "cacheVariables": {} }, { "name": "ninja-debug-cuda", From 2d84e57149e49e824cc874b5c8aff7c372fd0305 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 30 May 2023 14:01:43 -0300 Subject: [PATCH 12/69] improvement --- cpp/src/parquet/column_reader.cc | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index e50f2262af32e..01197507890b9 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2355,22 +2355,30 @@ std::shared_ptr MakeByteArrayRecordReader(const ColumnDescriptor* LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dictionary, - bool read_dense_for_nullable, - bool use_binary_string_large_variants) { + bool read_dense_for_nullable) { if (read_dictionary) { return std::make_shared(descr, leaf_info, pool, read_dense_for_nullable); } else { - if (use_binary_string_large_variants) { - return std::make_shared( - descr, leaf_info, pool, read_dense_for_nullable); - } - return std::make_shared(descr, leaf_info, pool, read_dense_for_nullable); } } +std::shared_ptr MakeLargeByteArrayRecordReader(const ColumnDescriptor* descr, + LevelInfo leaf_info, + ::arrow::MemoryPool* pool, + bool read_dictionary, + bool read_dense_for_nullable) { + if (read_dictionary) { + return std::make_shared(descr, leaf_info, pool, + read_dense_for_nullable); + } else { + return std::make_shared( + descr, leaf_info, pool, read_dense_for_nullable); + } +} + } // namespace std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, @@ -2398,8 +2406,10 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, return std::make_shared>(descr, leaf_info, pool, read_dense_for_nullable); case Type::BYTE_ARRAY: { - return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, - read_dense_for_nullable, use_binary_string_large_variants); + return use_binary_string_large_variants ? 
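// The two factories mirror each other: read_dictionary selects the dictionary
// record reader, otherwise the chunked one, each instantiated for the matching
// offset width (sketch; class names restored from earlier in this series):
//
//   if (read_dictionary) {
//     return std::make_shared<LargeByteArrayDictionaryRecordReader>(
//         descr, leaf_info, pool, read_dense_for_nullable);
//   }
//   return std::make_shared<LargeByteArrayChunkedRecordReader>(
//       descr, leaf_info, pool, read_dense_for_nullable);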
MakeLargeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary,
+                                                        read_dense_for_nullable)
+        : MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary,
+                                    read_dense_for_nullable);
   }
   case Type::FIXED_LEN_BYTE_ARRAY:
     return std::make_shared<FLBARecordReader>(descr, leaf_info, pool,

From 90f14df903ae8e416aea1d2b6c032b9c501a5fb1 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 30 May 2023 14:35:44 -0300
Subject: [PATCH 13/69] remove dictionary64

---
 cpp/src/arrow/array/builder_dict.h | 25 +------------------------
 cpp/src/parquet/column_reader.cc   |  2 +-
 cpp/src/parquet/encoding.cc        |  6 +++---
 cpp/src/parquet/encoding.h         |  4 +---
 4 files changed, 6 insertions(+), 31 deletions(-)

diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h
index 9a248dc6fe393..3adf5b843b916 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -715,29 +715,6 @@ class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder
   }
 };
 
-template <typename T>
-class Dictionary64Builder : public internal::DictionaryBuilderBase<Int64Builder, T> {
- public:
-  using BASE = internal::DictionaryBuilderBase<Int64Builder, T>;
-  using BASE::BASE;
-
-  /// \brief Append dictionary indices directly without modifying memo
-  ///
-  /// NOTE: Experimental API
-  Status AppendIndices(const int64_t* values, int64_t length,
-                       const uint8_t* valid_bytes = NULLPTR) {
-    int64_t null_count_before = this->indices_builder_.null_count();
-    ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
-    this->capacity_ = this->indices_builder_.capacity();
-    this->length_ += length;
-    this->null_count_ += this->indices_builder_.null_count() - null_count_before;
-    return Status::OK();
-  }
-};
-
 // ----------------------------------------------------------------------
 // Binary / Unicode builders
 // (compatibility aliases; those used to be derived classes with additional
@@ -747,7 +724,7 @@ using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
 using StringDictionaryBuilder = DictionaryBuilder<StringType>;
 using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
 using StringDictionary32Builder = Dictionary32Builder<StringType>;
-using BinaryDictionary64Builder = Dictionary64Builder<BinaryType>;
+using LargeBinaryDictionary32Builder = Dictionary32Builder<LargeBinaryType>;
 
 /// @}
 
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 01197507890b9..7666cbff5f104 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -2337,7 +2337,7 @@ class LargeByteArrayDictionaryRecordReader : public TypedRecordReader<LargeByte
 
  private:
   using LargeBinaryDictDecoder = DictDecoder<LargeByteArrayType>;
-  ::arrow::BinaryDictionary64Builder builder_;
+  ::arrow::LargeBinaryDictionary32Builder builder_;
   std::vector<std::shared_ptr<::arrow::Array>> result_chunks_;
 };
 
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index f124db0736875..634ce1496c75e 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -1551,7 +1551,7 @@ class PlainLargeByteArrayDecoder : public PlainDecoder<LargeByteArrayType>,
 
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
-                  ::arrow::BinaryDictionary64Builder* builder) override {
+                  ::arrow::LargeBinaryDictionary32Builder* builder) override {
     int result = 0;
     PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
                                      valid_bits_offset, builder, &result));
@@ -2101,7 +2101,7 @@ void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* bui
 
 template <>
 void DictDecoderImpl<LargeByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* builder) {
-  auto binary_builder =
checked_cast<::arrow::LargeBinaryDictionary32Builder*>(builder); // Make a BinaryArray referencing the internal dictionary data auto arr = std::make_shared<::arrow::LargeBinaryArray>( @@ -2331,7 +2331,7 @@ class DictLargeByteArrayDecoderImpl : public DictDecoderImpl int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary64Builder* builder) override { + ::arrow::LargeBinaryDictionary32Builder* builder) override { int result = 0; if (null_count == 0) { PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 30345852912e8..1218a650238d7 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -45,8 +45,6 @@ class NumericBuilder; class FixedSizeBinaryBuilder; template class Dictionary32Builder; -template -class Dictionary64Builder; } // namespace arrow @@ -167,7 +165,7 @@ struct EncodingTraits { std::vector> chunks; }; using ArrowType = ::arrow::LargeBinaryType; - using DictAccumulator = ::arrow::Dictionary64Builder<::arrow::LargeBinaryType>; + using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::LargeBinaryType>; }; template <> From b88b024c9352e1a42daad84472ef602bc639f96b Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 10:58:45 -0300 Subject: [PATCH 14/69] use 64bit on largebytearray class and initialize binary_large_variant bool to false --- cpp/src/parquet/properties.h | 3 ++- cpp/src/parquet/types.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 4da43ffe91b23..26dfcaeb320ad 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -790,7 +790,8 @@ class PARQUET_EXPORT ArrowReaderProperties { batch_size_(kArrowDefaultBatchSize), pre_buffer_(false), cache_options_(::arrow::io::CacheOptions::Defaults()), - coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {} + coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO), + use_binary_large_variants_(false) {} /// \brief Set whether to use the IO thread pool to parse columns in parallel. 
/// diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 3a16a84e9e3a0..cca6247922ce2 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -594,12 +594,12 @@ inline bool operator!=(const ByteArray& left, const ByteArray& right) { struct LargeByteArray { LargeByteArray() : len(0), ptr(NULLPTR) {} - LargeByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} + LargeByteArray(uint64_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} LargeByteArray(::std::string_view view) // NOLINT implicit conversion : LargeByteArray(view.size(), reinterpret_cast(view.data())) {} - uint32_t len; + uint64_t len; const uint8_t* ptr; }; From 0b53b05a1defbe8c5d97668775f099ff654e0f17 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 13:48:19 -0300 Subject: [PATCH 15/69] add chunked string map test --- .../parquet/arrow/arrow_reader_writer_test.cc | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index ad33ca296a283..0196f73e91a92 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3834,13 +3834,14 @@ TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) { ASSERT_EQ(expected, calculated); } -void TryReadDataFile(const std::string& path, - ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) { +void TryReadDataFileWithProperties(const std::string& path, + const ArrowReaderProperties& properties, + ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) { auto pool = ::arrow::default_memory_pool(); std::unique_ptr arrow_reader; Status s = - FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), &arrow_reader); + FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), properties, &arrow_reader); if (s.ok()) { std::shared_ptr<::arrow::Table> table; s = arrow_reader->ReadTable(&table); @@ -3851,6 +3852,11 @@ void TryReadDataFile(const std::string& path, << ", but got " << s.ToString(); } +void TryReadDataFile(const std::string& path, + ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) { + TryReadDataFileWithProperties(path, default_arrow_reader_properties(), expected_code); +} + TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) { // PARQUET-995 TryReadDataFile(test::get_data_file("alltypes_plain.parquet")); @@ -3862,6 +3868,18 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) { TryReadDataFile(path, ::arrow::StatusCode::IOError); } +TEST(TestArrowParquet, LargeByteArray) { + auto path = test::get_data_file("chunked_string_map.parquet"); + + TryReadDataFile(path, ::arrow::StatusCode::NotImplemented); + + auto reader_properties = default_arrow_reader_properties(); + + reader_properties.set_use_binary_large_variants(true); + + TryReadDataFileWithProperties(path, reader_properties); +} + TEST(TestArrowReaderAdHoc, LARGE_MEMORY_TEST(LargeStringColumn)) { // ARROW-3762 ::arrow::StringBuilder builder; From f574e2ecd81d1e1c3ae5e1fdada7b41a8a0d6b87 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 14:09:12 -0300 Subject: [PATCH 16/69] add boolean comment --- cpp/src/parquet/arrow/reader.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index b95be9cdd1415..2de3c9ab4b006 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -463,7 +463,12 @@ class LeafReader : public 
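The new LargeByteArray test above boils down to the following read path. A sketch of the intended usage; note that set_use_binary_large_variants() exists only in this patch series, not in upstream Arrow:

#include <memory>
#include <string>
#include "arrow/table.h"
#include "parquet/arrow/reader.h"
#include "parquet/file_reader.h"

std::shared_ptr<::arrow::Table> ReadWithLargeVariants(const std::string& path) {
  auto properties = parquet::default_arrow_reader_properties();
  properties.set_use_binary_large_variants(true);  // opt in to LargeBinary output

  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(parquet::arrow::FileReader::Make(
      ::arrow::default_memory_pool(),
      parquet::ParquetFileReader::OpenFile(path, /*memory_map=*/false),
      properties, &reader));

  std::shared_ptr<::arrow::Table> table;
  // Per the test, reading chunked_string_map.parquet without the flag is
  // expected to fail with NotImplemented; with it, the read succeeds.
  PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
  return table;
}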
ColumnReaderImpl { input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make( - descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, ctx_->use_binary_large_variants); + descr_, + leaf_info, + ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, + /*read_dense_for_nullable*/ false, + ctx_->use_binary_large_variants + ); NextRowGroup(); } From 295e062a73ad6a0c6f6a5683f8b71525a0553ae9 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 14:53:53 -0300 Subject: [PATCH 17/69] Make ChunkedRecordReader generic by using templates --- cpp/src/parquet/column_reader.cc | 87 ++++++++++++++------------------ 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 7666cbff5f104..d936cee827ef9 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2093,57 +2093,41 @@ class FLBARecordReader : public TypedRecordReader, std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_; }; -class ByteArrayChunkedRecordReader : public TypedRecordReader, - virtual public BinaryRecordReader { - public: - ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool, bool read_dense_for_nullable) - : TypedRecordReader(descr, leaf_info, pool, - read_dense_for_nullable) { - ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); - accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool); - } - - ::arrow::ArrayVector GetBuilderChunks() override { - ::arrow::ArrayVector result = accumulator_.chunks; - if (result.size() == 0 || accumulator_.builder->length() > 0) { - std::shared_ptr<::arrow::Array> last_chunk; - PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk)); - result.push_back(std::move(last_chunk)); - } - accumulator_.chunks = {}; - return result; - } - - void ReadValuesDense(int64_t values_to_read) override { - int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull( - static_cast(values_to_read), &accumulator_); - CheckNumberDecoded(num_decoded, values_to_read); - ResetValues(); - } - - void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { - int64_t num_decoded = this->current_decoder_->DecodeArrow( - static_cast(values_to_read), static_cast(null_count), - valid_bits_->mutable_data(), values_written_, &accumulator_); - CheckNumberDecoded(num_decoded, values_to_read - null_count); - ResetValues(); - } - - private: - // Helper data structure for accumulating builder chunks - typename EncodingTraits::Accumulator accumulator_; +// Below concept could be used to simplify type assertion, but it seems like c++20 is not +// available +//template +//concept ByteArrayTypeConcept = std::is_same::value || +// std::is_same::value; + +template +struct IsByteArrayType : std::false_type {}; + +template<> +struct IsByteArrayType : std::true_type {}; + +template<> +struct IsByteArrayType : std::true_type {}; + +template +struct ByteArrayBuilderTypeTrait { + using BuilderType = typename std::conditional::value, + ::arrow::LargeBinaryBuilder, + ::arrow::BinaryBuilder>::type; }; -class LargeByteArrayChunkedRecordReader : public TypedRecordReader, - virtual public LargeBinaryRecordReader { +template +class ChunkedRecordReader : public TypedRecordReader, + virtual public BinaryRecordReader { public: - LargeByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool, bool 
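The ByteArrayBuilderTypeTrait introduced above is the crux of the templated reader: the Arrow builder type is selected at compile time from the physical-type tag, so one class body serves both variants. A reduced model of the same pattern (BuilderFor is an illustrative name, not part of the patch):

#include <type_traits>
#include "arrow/array/builder_binary.h"
#include "parquet/types.h"

template <typename T>
using BuilderFor =
    typename std::conditional<std::is_same<T, parquet::LargeByteArrayType>::value,
                              ::arrow::LargeBinaryBuilder,
                              ::arrow::BinaryBuilder>::type;

static_assert(std::is_same<BuilderFor<parquet::ByteArrayType>,
                           ::arrow::BinaryBuilder>::value, "");
static_assert(std::is_same<BuilderFor<parquet::LargeByteArrayType>,
                           ::arrow::LargeBinaryBuilder>::value, "");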
read_dense_for_nullable) - : TypedRecordReader(descr, leaf_info, pool, + using BuilderType = typename ByteArrayBuilderTypeTrait::BuilderType; + + ChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, + ::arrow::MemoryPool* pool, bool read_dense_for_nullable) + : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { - ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); - accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool); + static_assert(IsByteArrayType::value, "Invalid ByteArrayType"); + ARROW_DCHECK_EQ(TypedRecordReader::descr_->physical_type(), Type::BYTE_ARRAY); + accumulator_.builder = std::make_unique(pool); } ::arrow::ArrayVector GetBuilderChunks() override { @@ -2161,7 +2145,7 @@ class LargeByteArrayChunkedRecordReader : public TypedRecordReadercurrent_decoder_->DecodeArrowNonNull( static_cast(values_to_read), &accumulator_); CheckNumberDecoded(num_decoded, values_to_read); - ResetValues(); + TypedRecordReader::ResetValues(); } void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { @@ -2169,14 +2153,17 @@ class LargeByteArrayChunkedRecordReader : public TypedRecordReader(values_to_read), static_cast(null_count), valid_bits_->mutable_data(), values_written_, &accumulator_); CheckNumberDecoded(num_decoded, values_to_read - null_count); - ResetValues(); + TypedRecordReader::ResetValues(); } private: // Helper data structure for accumulating builder chunks - typename EncodingTraits::Accumulator accumulator_; + typename EncodingTraits::Accumulator accumulator_; }; +using ByteArrayChunkedRecordReader = ChunkedRecordReader; +using LargeByteArrayChunkedRecordReader = ChunkedRecordReader; + class ByteArrayDictionaryRecordReader : public TypedRecordReader, virtual public DictionaryRecordReader { public: From 25d7815996b48cbc388250e823a6683f5dc1d851 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 16:26:17 -0300 Subject: [PATCH 18/69] Make ByteArrayDictionaryReader generic with the use of templates --- cpp/src/parquet/column_reader.cc | 105 +++++-------------------------- 1 file changed, 14 insertions(+), 91 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index d936cee827ef9..10a63ede4fd59 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2164,12 +2164,14 @@ class ChunkedRecordReader : public TypedRecordReader, using ByteArrayChunkedRecordReader = ChunkedRecordReader; using LargeByteArrayChunkedRecordReader = ChunkedRecordReader; -class ByteArrayDictionaryRecordReader : public TypedRecordReader, - virtual public DictionaryRecordReader { + +template +class DictionaryRecordReaderImpl : public TypedRecordReader, + virtual public DictionaryRecordReader { public: - ByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, + DictionaryRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) - : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), + : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), builder_(pool) { this->read_dictionary_ = true; } @@ -2206,7 +2208,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, void ReadValuesDense(int64_t values_to_read) override { int64_t num_decoded = 0; - if (current_encoding_ == Encoding::RLE_DICTIONARY) { + if (TypedRecordReader::current_encoding_ == Encoding::RLE_DICTIONARY) { MaybeWriteNewDictionary(); auto decoder = 
dynamic_cast(this->current_decoder_); num_decoded = decoder->DecodeIndices(static_cast(values_to_read), &builder_); @@ -2215,14 +2217,14 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, static_cast(values_to_read), &builder_); /// Flush values since they have been copied into the builder - ResetValues(); + TypedRecordReader::ResetValues(); } CheckNumberDecoded(num_decoded, values_to_read); } void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { int64_t num_decoded = 0; - if (current_encoding_ == Encoding::RLE_DICTIONARY) { + if (TypedRecordReader::current_encoding_ == Encoding::RLE_DICTIONARY) { MaybeWriteNewDictionary(); auto decoder = dynamic_cast(this->current_decoder_); num_decoded = decoder->DecodeIndicesSpaced( @@ -2234,99 +2236,20 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, valid_bits_->mutable_data(), values_written_, &builder_); /// Flush values since they have been copied into the builder - ResetValues(); + TypedRecordReader::ResetValues(); } ARROW_DCHECK_EQ(num_decoded, values_to_read - null_count); } private: - using BinaryDictDecoder = DictDecoder; + using BinaryDictDecoder = DictDecoder; - ::arrow::BinaryDictionary32Builder builder_; + typename EncodingTraits::DictAccumulator builder_; std::vector> result_chunks_; }; -class LargeByteArrayDictionaryRecordReader : public TypedRecordReader, - virtual public DictionaryRecordReader { - public: - LargeByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool, bool read_dense_for_nullable) - : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), - builder_(pool) { - this->read_dictionary_ = true; - } - - std::shared_ptr<::arrow::ChunkedArray> GetResult() override { - FlushBuilder(); - std::vector> result; - std::swap(result, result_chunks_); - return std::make_shared<::arrow::ChunkedArray>(std::move(result), builder_.type()); - } - - void FlushBuilder() { - if (builder_.length() > 0) { - std::shared_ptr<::arrow::Array> chunk; - PARQUET_THROW_NOT_OK(builder_.Finish(&chunk)); - result_chunks_.emplace_back(std::move(chunk)); - - // Also clears the dictionary memo table - builder_.Reset(); - } - } - - void MaybeWriteNewDictionary() { - if (this->new_dictionary_) { - /// If there is a new dictionary, we may need to flush the builder, then - /// insert the new dictionary values - FlushBuilder(); - builder_.ResetFull(); - auto decoder = dynamic_cast(this->current_decoder_); - decoder->InsertDictionary(&builder_); - this->new_dictionary_ = false; - } - } - - void ReadValuesDense(int64_t values_to_read) override { - int64_t num_decoded = 0; - if (current_encoding_ == Encoding::RLE_DICTIONARY) { - MaybeWriteNewDictionary(); - auto decoder = dynamic_cast(this->current_decoder_); - num_decoded = decoder->DecodeIndices(static_cast(values_to_read), &builder_); - } else { - num_decoded = this->current_decoder_->DecodeArrowNonNull( - static_cast(values_to_read), &builder_); - - /// Flush values since they have been copied into the builder - ResetValues(); - } - CheckNumberDecoded(num_decoded, values_to_read); - } - - void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { - int64_t num_decoded = 0; - if (current_encoding_ == Encoding::RLE_DICTIONARY) { - MaybeWriteNewDictionary(); - auto decoder = dynamic_cast(this->current_decoder_); - num_decoded = decoder->DecodeIndicesSpaced( - static_cast(values_to_read), static_cast(null_count), - valid_bits_->mutable_data(), values_written_, &builder_); 
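The explicit TypedRecordReader<...>:: qualifications scattered through these methods are not style noise: once the base class depends on a template parameter, its members are no longer found by unqualified name lookup in the derived class (two-phase lookup). A minimal illustration of the rule:

template <typename T>
struct Base {
  int member_ = 0;
};

template <typename T>
struct Derived : Base<T> {
  int Get() {
    // return member_;      // error: 'member_' is in a dependent base,
    //                      // so unqualified lookup does not see it
    return this->member_;   // OK; Base<T>::member_ also works
  }
};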
- } else { - num_decoded = this->current_decoder_->DecodeArrow( - static_cast(values_to_read), static_cast(null_count), - valid_bits_->mutable_data(), values_written_, &builder_); - - /// Flush values since they have been copied into the builder - ResetValues(); - } - ARROW_DCHECK_EQ(num_decoded, values_to_read - null_count); - } - - private: - using LargeBinaryDictDecoder = DictDecoder; - - ::arrow::LargeBinaryDictionary32Builder builder_; - std::vector> result_chunks_; -}; +using ByteArrayDictionaryRecordReader = DictionaryRecordReaderImpl; +using LargeByteArrayDictionaryRecordReader = DictionaryRecordReaderImpl; // TODO(wesm): Implement these to some satisfaction template <> From fe8d67bf7b4e5e8dfd1f25cda36672d177b9d592 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 16:55:17 -0300 Subject: [PATCH 19/69] make arrowbinaryhelper generic --- cpp/src/parquet/encoding.cc | 49 ++++++------------------------------- cpp/src/parquet/encoding.h | 12 +++++++-- 2 files changed, 18 insertions(+), 43 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 634ce1496c75e..eee5a914bf7ff 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1271,12 +1271,13 @@ int PlainBooleanDecoder::Decode(bool* buffer, int max_values) { return max_values; } -struct ArrowBinaryHelper { - explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* out) { +template +struct ArrowBinaryHelperBase { + explicit ArrowBinaryHelperBase(typename EncodingTraits::Accumulator* out) { this->out = out; this->builder = out->builder.get(); this->chunk_space_remaining = - ::arrow::kBinaryMemoryLimit - this->builder->value_data_length(); + EncodingTraits::memory_limit - this->builder->value_data_length(); } Status PushChunk() { @@ -1303,47 +1304,13 @@ struct ArrowBinaryHelper { Status AppendNull() { return builder->AppendNull(); } - typename EncodingTraits::Accumulator* out; - ::arrow::BinaryBuilder* builder; + typename EncodingTraits::Accumulator* out; + typename EncodingTraits::BinaryBuilder* builder; int64_t chunk_space_remaining; }; -struct ArrowLargeBinaryHelper { - explicit ArrowLargeBinaryHelper(typename EncodingTraits::Accumulator* out) { - this->out = out; - this->builder = out->builder.get(); - this->chunk_space_remaining = - ::arrow::kLargeBinaryMemoryLimit - this->builder->value_data_length(); - } - - Status PushChunk() { - std::shared_ptr<::arrow::Array> result; - RETURN_NOT_OK(builder->Finish(&result)); - out->chunks.push_back(result); - chunk_space_remaining = ::arrow::kLargeBinaryMemoryLimit; - return Status::OK(); - } - - bool CanFit(int64_t length) const { return length <= chunk_space_remaining; } - - void UnsafeAppend(const uint8_t* data, int64_t length) { - chunk_space_remaining -= length; - builder->UnsafeAppend(data, length); - } - - void UnsafeAppendNull() { builder->UnsafeAppendNull(); } - - Status Append(const uint8_t* data, int64_t length) { - chunk_space_remaining -= length; - return builder->Append(data, length); - } - - Status AppendNull() { return builder->AppendNull(); } - - typename EncodingTraits::Accumulator* out; - ::arrow::LargeBinaryBuilder* builder; - int64_t chunk_space_remaining; -}; +using ArrowBinaryHelper = ArrowBinaryHelperBase; +using ArrowLargeBinaryHelper = ArrowBinaryHelperBase; template <> inline int PlainDecoder::DecodeArrow( diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 1218a650238d7..b138c45e40f48 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ 
-24,6 +24,7 @@ #include "arrow/util/spaced.h" +#include "arrow/type.h" #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/types.h" @@ -142,30 +143,37 @@ template <> struct EncodingTraits { using Encoder = ByteArrayEncoder; using Decoder = ByteArrayDecoder; + using BinaryBuilder = ::arrow::BinaryBuilder; /// \brief Internal helper class for decoding BYTE_ARRAY data where we can /// overflow the capacity of a single arrow::BinaryArray struct Accumulator { - std::unique_ptr<::arrow::BinaryBuilder> builder; + std::unique_ptr builder; std::vector> chunks; }; using ArrowType = ::arrow::BinaryType; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; + + + static constexpr auto memory_limit = ::arrow::kBinaryMemoryLimit; }; template <> struct EncodingTraits { using Encoder = LargeByteArrayEncoder; using Decoder = LargeByteArrayDecoder; + using BinaryBuilder = ::arrow::LargeBinaryBuilder; /// \brief Internal helper class for decoding BYTE_ARRAY data where we can /// overflow the capacity of a single arrow::BinaryArray struct Accumulator { - std::unique_ptr<::arrow::LargeBinaryBuilder> builder; + std::unique_ptr builder; std::vector> chunks; }; using ArrowType = ::arrow::LargeBinaryType; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::LargeBinaryType>; + + static constexpr auto memory_limit = ::arrow::kLargeBinaryMemoryLimit; }; template <> From 35e58356f7c9a5e7dcbc7ac01d83d5772f808467 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 31 May 2023 17:08:39 -0300 Subject: [PATCH 20/69] Make PlainByteArrayDecoder generic --- cpp/src/parquet/encoding.cc | 166 ++++++------------------------------ 1 file changed, 26 insertions(+), 140 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index eee5a914bf7ff..6cd9a833bbff6 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1388,10 +1388,11 @@ inline int PlainDecoder::DecodeArrow( return values_decoded; } -class PlainByteArrayDecoder : public PlainDecoder, - virtual public ByteArrayDecoder { +template +class PlainByteArrayDecoderBase : public PlainDecoder, + virtual public TypedDecoder { public: - using Base = PlainDecoder; + using Base = PlainDecoder; using Base::DecodeSpaced; using Base::PlainDecoder; @@ -1400,7 +1401,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { + typename EncodingTraits::DictAccumulator* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -1412,7 +1413,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out) override { + typename EncodingTraits::Accumulator* out) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, valid_bits_offset, out, &result)); @@ -1422,28 +1423,28 @@ class PlainByteArrayDecoder : public PlainDecoder, private: Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out, + typename EncodingTraits::Accumulator* out, int* out_values_decoded) { - ArrowBinaryHelper helper(out); + ArrowBinaryHelperBase helper(out); int values_decoded = 0; 
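The chunk_space_remaining bookkeeping in the templated helper exists because each instantiation plugs a different cap in via EncodingTraits::memory_limit: binary offsets are signed 32-bit, so the plain variant must split chunks near 2 GiB, while the large variant effectively never splits. A sketch of the relationship, assuming the upstream value of kBinaryMemoryLimit (kLargeBinaryMemoryLimit itself is introduced by this series):

#include <cstdint>
#include <limits>
#include <type_traits>
#include "arrow/array/builder_binary.h"  // defines arrow::kBinaryMemoryLimit
#include "arrow/type.h"

// 32-bit offsets cap one BinaryArray's value data just under 2 GiB; the
// 64-bit-offset LargeBinaryArray lifts that cap, which is the whole point
// of the binary "large variants".
static_assert(std::is_same<::arrow::BinaryType::offset_type, int32_t>::value, "");
static_assert(std::is_same<::arrow::LargeBinaryType::offset_type, int64_t>::value, "");
static_assert(::arrow::kBinaryMemoryLimit ==
                  std::numeric_limits<int32_t>::max() - 1, "");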
RETURN_NOT_OK(helper.builder->Reserve(num_values)); RETURN_NOT_OK(helper.builder->ReserveData( - std::min(len_, helper.chunk_space_remaining))); + std::min(PlainDecoder::len_, helper.chunk_space_remaining))); int i = 0; RETURN_NOT_OK(VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { - if (ARROW_PREDICT_FALSE(len_ < 4)) { + if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < 4)) { ParquetException::EofException(); } - auto value_len = SafeLoadAs(data_); + auto value_len = SafeLoadAs(PlainDecoder::data_); if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); } auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(len_ < increment)) { + if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < increment)) { ParquetException::EofException(); } if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) { @@ -1451,11 +1452,11 @@ class PlainByteArrayDecoder : public PlainDecoder, RETURN_NOT_OK(helper.PushChunk()); RETURN_NOT_OK(helper.builder->Reserve(num_values - i)); RETURN_NOT_OK(helper.builder->ReserveData( - std::min(len_, helper.chunk_space_remaining))); + std::min(PlainDecoder::len_, helper.chunk_space_remaining))); } - helper.UnsafeAppend(data_ + 4, value_len); - data_ += increment; - len_ -= increment; + helper.UnsafeAppend(PlainDecoder::data_ + 4, value_len); + PlainDecoder::data_ += increment; + PlainDecoder::len_ -= increment; ++values_decoded; ++i; return Status::OK(); @@ -1466,7 +1467,7 @@ class PlainByteArrayDecoder : public PlainDecoder, return Status::OK(); })); - num_values_ -= values_decoded; + PlainDecoder::num_values_ -= values_decoded; *out_values_decoded = values_decoded; return Status::OK(); } @@ -1481,148 +1482,33 @@ class PlainByteArrayDecoder : public PlainDecoder, RETURN_NOT_OK(VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { - if (ARROW_PREDICT_FALSE(len_ < 4)) { + if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < 4)) { ParquetException::EofException(); } - auto value_len = SafeLoadAs(data_); + auto value_len = SafeLoadAs(PlainDecoder::data_); if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); } auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(len_ < increment)) { + if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < increment)) { ParquetException::EofException(); } - RETURN_NOT_OK(builder->Append(data_ + 4, value_len)); - data_ += increment; - len_ -= increment; + RETURN_NOT_OK(builder->Append(PlainDecoder::data_ + 4, value_len)); + PlainDecoder::data_ += increment; + PlainDecoder::len_ -= increment; ++values_decoded; return Status::OK(); }, [&]() { return builder->AppendNull(); })); - num_values_ -= values_decoded; + PlainDecoder::num_values_ -= values_decoded; *out_values_decoded = values_decoded; return Status::OK(); } }; -class PlainLargeByteArrayDecoder : public PlainDecoder, - virtual public LargeByteArrayDecoder { - public: - using Base = PlainDecoder; - using Base::DecodeSpaced; - using Base::PlainDecoder; - - // ---------------------------------------------------------------------- - // Dictionary read paths - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::LargeBinaryDictionary32Builder* builder) override { - int result = 0; - PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, - valid_bits_offset, builder, &result)); - return result; - } - 
- // ---------------------------------------------------------------------- - // Optimized dense binary read paths - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out) override { - int result = 0; - PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, - valid_bits_offset, out, &result)); - return result; - } - - private: - Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out, - int* out_values_decoded) { - ArrowLargeBinaryHelper helper(out); - int values_decoded = 0; - - RETURN_NOT_OK(helper.builder->Reserve(num_values)); - RETURN_NOT_OK(helper.builder->ReserveData( - std::min(len_, helper.chunk_space_remaining))); - - int i = 0; - RETURN_NOT_OK(VisitNullBitmapInline( - valid_bits, valid_bits_offset, num_values, null_count, - [&]() { - if (ARROW_PREDICT_FALSE(len_ < 4)) { - ParquetException::EofException(); - } - auto value_len = SafeLoadAs(data_); - if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { - return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); - } - auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(len_ < increment)) { - ParquetException::EofException(); - } - if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) { - // This element would exceed the capacity of a chunk - RETURN_NOT_OK(helper.PushChunk()); - RETURN_NOT_OK(helper.builder->Reserve(num_values - i)); - RETURN_NOT_OK(helper.builder->ReserveData( - std::min(len_, helper.chunk_space_remaining))); - } - helper.UnsafeAppend(data_ + 4, value_len); - data_ += increment; - len_ -= increment; - ++values_decoded; - ++i; - return Status::OK(); - }, - [&]() { - helper.UnsafeAppendNull(); - ++i; - return Status::OK(); - })); - - num_values_ -= values_decoded; - *out_values_decoded = values_decoded; - return Status::OK(); - } - - template - Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* out_values_decoded) { - RETURN_NOT_OK(builder->Reserve(num_values)); - int values_decoded = 0; - - RETURN_NOT_OK(VisitNullBitmapInline( - valid_bits, valid_bits_offset, num_values, null_count, - [&]() { - if (ARROW_PREDICT_FALSE(len_ < 4)) { - ParquetException::EofException(); - } - auto value_len = SafeLoadAs(data_); - if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { - return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); - } - auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(len_ < increment)) { - ParquetException::EofException(); - } - RETURN_NOT_OK(builder->Append(data_ + 4, value_len)); - data_ += increment; - len_ -= increment; - ++values_decoded; - return Status::OK(); - }, - [&]() { return builder->AppendNull(); })); - - num_values_ -= values_decoded; - *out_values_decoded = values_decoded; - return Status::OK(); - } -}; +using PlainByteArrayDecoder = PlainByteArrayDecoderBase; +using PlainLargeByteArrayDecoder = PlainByteArrayDecoderBase; class PlainFLBADecoder : public PlainDecoder, virtual public FLBADecoder { public: From 9aff2f35cae8bd29721462d231f110b3b7f1edbd Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 1 Jun 2023 09:47:21 -0300 Subject: [PATCH 21/69] remove use_binary_large_variant from parquet reader properties --- cpp/src/parquet/properties.h | 6 ------ 1 file changed, 6 deletions(-) diff --git 
a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 26dfcaeb320ad..1a56064a0864e 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -116,12 +116,6 @@ class PARQUET_EXPORT ReaderProperties { page_checksum_verification_ = check_crc; } - bool use_binary_large_variants() const { return use_binary_large_variants_; } - - void set_use_binary_large_variants(bool use_binary_large_variants) { - use_binary_large_variants_ = use_binary_large_variants; - } - private: MemoryPool* pool_; int64_t buffer_size_ = kDefaultBufferSize; From eb850c4b44f7138c2f495cedabec332fdf9db2d3 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 5 Jun 2023 17:27:17 -0300 Subject: [PATCH 22/69] removed parquet::type::large_Byte_array --- cpp/src/parquet/encoding.cc | 21 ++++++++++++-------- cpp/src/parquet/encoding.h | 10 ++++++---- cpp/src/parquet/metadata.cc | 2 -- cpp/src/parquet/page_index.cc | 6 ++---- cpp/src/parquet/stream_reader.cc | 4 +--- cpp/src/parquet/stream_writer.cc | 5 +---- cpp/src/parquet/types.cc | 1 - cpp/src/parquet/types.h | 34 ++++++++++++++++++++------------ 8 files changed, 44 insertions(+), 39 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 6cd9a833bbff6..c1d1cda8f99a1 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3744,7 +3744,7 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) { + ::arrow::MemoryPool* pool, bool use_binary_large_variant) { if (encoding == Encoding::PLAIN) { switch (type_num) { case Type::BOOLEAN: @@ -3760,9 +3760,11 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin case Type::DOUBLE: return std::make_unique>(descr); case Type::BYTE_ARRAY: - return std::make_unique(descr); - case Type::LARGE_BYTE_ARRAY: - return std::make_unique(descr); + if (use_binary_large_variant) { + return std::make_unique(descr); + } else { + return std::make_unique(descr); + } case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique(descr); default: @@ -3812,7 +3814,8 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin namespace detail { std::unique_ptr MakeDictDecoder(Type::type type_num, const ColumnDescriptor* descr, - MemoryPool* pool) { + MemoryPool* pool, + bool use_binary_large_variant) { switch (type_num) { case Type::BOOLEAN: ParquetException::NYI("Dictionary encoding not implemented for boolean type"); @@ -3827,9 +3830,11 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, case Type::DOUBLE: return std::make_unique>(descr, pool); case Type::BYTE_ARRAY: - return std::make_unique(descr, pool); - case Type::LARGE_BYTE_ARRAY: - return std::make_unique(descr, pool); + if (use_binary_large_variant) { + return std::make_unique(descr, pool); + } else { + return std::make_unique(descr, pool); + } case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, pool); default: diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index b138c45e40f48..6ebdd59c35214 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -462,14 +462,15 @@ std::unique_ptr::Encoder> MakeTypedEncoder( PARQUET_EXPORT std::unique_ptr MakeDecoder( Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool 
use_binary_large_variant = false); namespace detail { PARQUET_EXPORT std::unique_ptr MakeDictDecoder(Type::type type_num, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool); + ::arrow::MemoryPool* pool, + bool use_binary_large_variant); } // namespace detail @@ -478,7 +479,7 @@ std::unique_ptr> MakeDictDecoder( const ColumnDescriptor* descr = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { using OutType = DictDecoder; - auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool); + auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool, std::is_same_v); return std::unique_ptr(dynamic_cast(decoder.release())); } @@ -487,7 +488,8 @@ std::unique_ptr::Decoder> MakeTypedDecoder( Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { using OutType = typename EncodingTraits::Decoder; - std::unique_ptr base = MakeDecoder(DType::type_num, encoding, descr, pool); + + std::unique_ptr base = MakeDecoder(DType::type_num, encoding, descr, pool, std::is_same_v); return std::unique_ptr(dynamic_cast(base.release())); } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 055e679a9b685..0bbd96580774a 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -123,8 +123,6 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d return MakeTypedColumnStats(meta_data, descr); case Type::BYTE_ARRAY: return MakeTypedColumnStats(meta_data, descr); - case Type::LARGE_BYTE_ARRAY: - return MakeTypedColumnStats(meta_data, descr); case Type::FIXED_LEN_BYTE_ARRAY: return MakeTypedColumnStats(meta_data, descr); case Type::UNDEFINED: diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 969db469bbeb5..f3bca027dac5b 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -853,8 +853,7 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, return std::make_unique>(descr, column_index); case Type::BYTE_ARRAY: return std::make_unique>(descr, column_index); - case Type::LARGE_BYTE_ARRAY: - return std::make_unique>(descr, column_index); + // TODO AP FIX ARTHUR PASSOS case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, column_index); case Type::UNDEFINED: @@ -899,8 +898,7 @@ std::unique_ptr ColumnIndexBuilder::Make( return std::make_unique>(descr); case Type::BYTE_ARRAY: return std::make_unique>(descr); - case Type::LARGE_BYTE_ARRAY: - return std::make_unique>(descr); + // TODO AP FIX ARTHUR PASSOS case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr); case Type::UNDEFINED: diff --git a/cpp/src/parquet/stream_reader.cc b/cpp/src/parquet/stream_reader.cc index 66bcf5ca97560..fc22a76ab0ca9 100644 --- a/cpp/src/parquet/stream_reader.cc +++ b/cpp/src/parquet/stream_reader.cc @@ -488,9 +488,7 @@ void StreamReader::SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_sk case Type::BYTE_ARRAY: num_skipped = static_cast(reader)->Skip(num_rows_to_skip); break; - case Type::LARGE_BYTE_ARRAY: - num_skipped = static_cast(reader)->Skip(num_rows_to_skip); - break; + // TODO AP FIX ARTHUR PASSOS case Type::FIXED_LEN_BYTE_ARRAY: num_skipped = static_cast(reader)->Skip(num_rows_to_skip); break; diff --git a/cpp/src/parquet/stream_writer.cc b/cpp/src/parquet/stream_writer.cc index d93368740f9a8..e7cf995c4f486 100644 --- a/cpp/src/parquet/stream_writer.cc +++ b/cpp/src/parquet/stream_writer.cc @@ -251,10 +251,7 @@ void StreamWriter::WriteNullValue(ColumnWriter* writer) 
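MakeTypedDecoder and MakeDictDecoder now derive the runtime use_binary_large_variant flag from the template argument, so callers that instantiate with LargeByteArrayType get the large decoders without passing anything extra. A reduced model of that dispatch (kUseLargeVariant is an illustrative name, not part of the patch):

#include <type_traits>
#include "parquet/types.h"

template <typename DType>
constexpr bool kUseLargeVariant = std::is_same_v<DType, parquet::LargeByteArrayType>;

static_assert(!kUseLargeVariant<parquet::ByteArrayType>, "");
static_assert(kUseLargeVariant<parquet::LargeByteArrayType>, "");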
{ static_cast(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr); break; - case Type::LARGE_BYTE_ARRAY: - static_cast(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero, - &kRepLevelZero, nullptr); - break; + // TODO AP FIX ARTHUR PASSOS case Type::FIXED_LEN_BYTE_ARRAY: static_cast(writer)->WriteBatch( kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr); diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index d5d0442177934..28f472aaf9dd8 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -260,7 +260,6 @@ SortOrder::type DefaultSortOrder(Type::type primitive) { case Type::DOUBLE: return SortOrder::SIGNED; case Type::BYTE_ARRAY: - case Type::LARGE_BYTE_ARRAY: case Type::FIXED_LEN_BYTE_ARRAY: return SortOrder::UNSIGNED; case Type::INT96: diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index cca6247922ce2..972979fa29cb9 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -64,10 +64,6 @@ struct Type { DOUBLE = 5, BYTE_ARRAY = 6, FIXED_LEN_BYTE_ARRAY = 7, - - // This parquet type does not actually exist (AFAIK) and is used to - // create proper type traits - LARGE_BYTE_ARRAY = 8, // Should always be last element. UNDEFINED = 9 }; @@ -768,13 +764,13 @@ struct type_traits { static constexpr const char* printf_code = "s"; }; -template<> -struct type_traits { - using value_type = LargeByteArray; - - static constexpr int value_byte_size = sizeof(LargeByteArray); - static constexpr const char* printf_code = "ls"; -}; +//template<> +//struct type_traits { +// using value_type = LargeByteArray; +// +// static constexpr int value_byte_size = sizeof(LargeByteArray); +// static constexpr const char* printf_code = "ls"; +//}; template <> struct type_traits { @@ -796,8 +792,20 @@ using Int64Type = PhysicalType; using Int96Type = PhysicalType; using FloatType = PhysicalType; using DoubleType = PhysicalType; -using ByteArrayType = PhysicalType; -using LargeByteArrayType = PhysicalType; + +struct ByteArrayType +{ + using c_type = typename type_traits::value_type; + static constexpr Type::type type_num = Type::BYTE_ARRAY; +}; + + +struct LargeByteArrayType +{ + using c_type = typename type_traits::value_type; + static constexpr Type::type type_num = Type::BYTE_ARRAY; +}; + using FLBAType = PhysicalType; template From c2aab6304d8e43a9d92125cd9a05d6e88a37b5f5 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 5 Jun 2023 17:29:12 -0300 Subject: [PATCH 23/69] small adjustment --- cpp/src/parquet/types.h | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 972979fa29cb9..c8eb51ec90f53 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -764,14 +764,6 @@ struct type_traits { static constexpr const char* printf_code = "s"; }; -//template<> -//struct type_traits { -// using value_type = LargeByteArray; -// -// static constexpr int value_byte_size = sizeof(LargeByteArray); -// static constexpr const char* printf_code = "ls"; -//}; - template <> struct type_traits { using value_type = FixedLenByteArray; @@ -792,14 +784,11 @@ using Int64Type = PhysicalType; using Int96Type = PhysicalType; using FloatType = PhysicalType; using DoubleType = PhysicalType; +using ByteArrayType = PhysicalType; -struct ByteArrayType -{ - using c_type = typename type_traits::value_type; - static constexpr Type::type type_num = Type::BYTE_ARRAY; -}; - - +/* + * TODO AP add a comment explaining why the below is needed + * */ struct 
LargeByteArrayType { using c_type = typename type_traits::value_type; From 837ed6c88ccdbaf5849b5e798ca8d6bc2c6038ca Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 08:13:54 -0300 Subject: [PATCH 24/69] remove largebytearray class --- cpp/src/parquet/column_scanner.h | 8 -------- cpp/src/parquet/encoding.cc | 33 -------------------------------- cpp/src/parquet/types.h | 24 ----------------------- 3 files changed, 65 deletions(-) diff --git a/cpp/src/parquet/column_scanner.h b/cpp/src/parquet/column_scanner.h index 7bea4ca24d6db..d53435f03cd32 100644 --- a/cpp/src/parquet/column_scanner.h +++ b/cpp/src/parquet/column_scanner.h @@ -225,14 +225,6 @@ inline void TypedScanner::FormatValue(void* val, char* buffer, in snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); } -template <> -inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, - int width) { - std::string fmt = format_fwf(width); - std::string result = LargeByteArrayToString(*reinterpret_cast(val)); - snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); -} - template <> inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, int width) { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index c1d1cda8f99a1..e620686b30c60 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1126,39 +1126,6 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size, int nu return bytes_decoded; } -static inline int64_t ReadLargeByteArray(const uint8_t* data, int64_t data_size, - LargeByteArray* out) { - if (ARROW_PREDICT_FALSE(data_size < 4)) { - ParquetException::EofException(); - } - const int32_t len = SafeLoadAs(data); - if (len < 0) { - throw ParquetException("Invalid BYTE_ARRAY value"); - } - const int64_t consumed_length = static_cast(len) + 4; - if (ARROW_PREDICT_FALSE(data_size < consumed_length)) { - ParquetException::EofException(); - } - *out = LargeByteArray{static_cast(len), data + 4}; - return consumed_length; -} - -template <> -inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values, - int type_length, LargeByteArray* out) { - int bytes_decoded = 0; - for (int i = 0; i < num_values; ++i) { - const auto increment = ReadLargeByteArray(data, data_size, out + i); - if (ARROW_PREDICT_FALSE(increment > INT_MAX - bytes_decoded)) { - throw ParquetException("BYTE_ARRAY chunk too large"); - } - data += increment; - data_size -= increment; - bytes_decoded += static_cast(increment); - } - return bytes_decoded; -} - // Template specialization for FIXED_LEN_BYTE_ARRAY. The written values do not // own their own data. 
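After these removals, LargeByteArrayType is a pure compile-time tag: it maps to the same physical Parquet type and the same C++ value struct as ByteArrayType, and only template dispatch tells the two apart. What that buys, written as static_asserts against this series' parquet/types.h:

#include <type_traits>
#include "parquet/types.h"

static_assert(parquet::LargeByteArrayType::type_num == parquet::Type::BYTE_ARRAY, "");
static_assert(std::is_same<parquet::LargeByteArrayType::c_type,
                           parquet::ByteArray>::value, "");
static_assert(!std::is_same<parquet::LargeByteArrayType,
                            parquet::ByteArrayType>::value, "");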
template <> diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index c8eb51ec90f53..bb897b5073f5a 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -588,26 +588,6 @@ inline bool operator!=(const ByteArray& left, const ByteArray& right) { return !(left == right); } -struct LargeByteArray { - LargeByteArray() : len(0), ptr(NULLPTR) {} - LargeByteArray(uint64_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} - - LargeByteArray(::std::string_view view) // NOLINT implicit conversion - : LargeByteArray(view.size(), - reinterpret_cast(view.data())) {} - uint64_t len; - const uint8_t* ptr; -}; - -inline bool operator==(const LargeByteArray& left, const LargeByteArray& right) { - return left.len == right.len && - (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0); -} - -inline bool operator!=(const LargeByteArray& left, const LargeByteArray& right) { - return !(left == right); -} - struct FixedLenByteArray { FixedLenByteArray() : ptr(NULLPTR) {} explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {} @@ -642,10 +622,6 @@ static inline std::string ByteArrayToString(const ByteArray& a) { return std::string(reinterpret_cast(a.ptr), a.len); } -static inline std::string LargeByteArrayToString(const LargeByteArray& a) { - return std::string(reinterpret_cast(a.ptr), a.len); -} - static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) { std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); } From 35cdb993f4c753c9028699ea925a1fc7ae0c44a5 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 08:18:53 -0300 Subject: [PATCH 25/69] simplify largebytearraytype a bit --- cpp/src/parquet/types.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index bb897b5073f5a..6dcc5e082468b 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -765,11 +765,8 @@ using ByteArrayType = PhysicalType; /* * TODO AP add a comment explaining why the below is needed * */ -struct LargeByteArrayType -{ - using c_type = typename type_traits::value_type; - static constexpr Type::type type_num = Type::BYTE_ARRAY; -}; +struct LargeByteArrayType : public ByteArrayType +{}; using FLBAType = PhysicalType; From a5000e17e8509b3269b417005f99aae557687f74 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 10:01:00 -0300 Subject: [PATCH 26/69] simplify dictbytearraydecoderimpl a bit --- cpp/src/parquet/encoding.cc | 576 ++++++++++++------------------------ 1 file changed, 185 insertions(+), 391 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index e620686b30c60..1067f5af0c4bc 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -26,6 +26,7 @@ #include #include #include +#include #include "arrow/array.h" #include "arrow/array/builder_dict.h" @@ -1929,434 +1930,227 @@ void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); } -class DictByteArrayDecoderImpl : public DictDecoderImpl, - virtual public ByteArrayDecoder { - public: - using BASE = DictDecoderImpl; - using BASE::DictDecoderImpl; +template +class DictByteArrayDecoderImpl : public DictDecoderImpl, + virtual public TypedDecoder { - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { - int result = 0; - if (null_count == 0) { - 
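Note the using BASE::dictionary_ / BASE::idx_decoder_ / BASE::IndexInBounds declarations at the top of the merged class: they are the other standard fix for dependent-base lookup, pulling each name into scope once instead of writing this-> at every use site. A minimal illustration:

template <typename T>
struct Base {
  int member_ = 0;
};

template <typename T>
struct Derived : Base<T> {
  using BASE = Base<T>;
  using BASE::member_;           // bring the dependent-base name into scope
  int Get() { return member_; }  // unqualified lookup now succeeds
};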
PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
- } else {
- PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
- valid_bits_offset, builder, &result));
- }
- return result;
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits::Accumulator* out) override {
- int result = 0;
- if (null_count == 0) {
- PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result));
- } else {
- PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values,
null_count, valid_bits, + valid_bits_offset, out, &result)); + } + return result; + } + + private: + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + constexpr int32_t kBufferSize = 1024; + int32_t indices[kBufferSize]; + + ArrowBinaryHelperBase helper(out); + + auto dict_values = reinterpret_cast(dictionary_->data()); + int values_decoded = 0; + int num_indices = 0; + int pos_indices = 0; + + auto visit_valid = [&](int64_t position) -> Status { + if (num_indices == pos_indices) { + // Refill indices buffer + const auto batch_size = + std::min(kBufferSize, num_values - null_count - values_decoded); + num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (ARROW_PREDICT_FALSE(num_indices < 1)) { + return Status::Invalid("Invalid number of indices: ", num_indices); } + pos_indices = 0; } - } - } - - *out_num_values = values_decoded; - return Status::OK(); - } - - Status DecodeArrowDenseNonNull(int num_values, - typename EncodingTraits::Accumulator* out, - int* out_num_values) { - constexpr int32_t kBufferSize = 2048; - int32_t indices[kBufferSize]; - int values_decoded = 0; - - ArrowBinaryHelper helper(out); - auto dict_values = reinterpret_cast(dictionary_->data()); - - while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (num_indices == 0) ParquetException::EofException(); - for (int i = 0; i < num_indices; ++i) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; + const auto index = indices[pos_indices++]; + RETURN_NOT_OK(IndexInBounds(index)); + const auto& val = dict_values[index]; if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { RETURN_NOT_OK(helper.PushChunk()); } RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); - } - values_decoded += num_indices; - } - *out_num_values = values_decoded; - return Status::OK(); - } - - template - Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* out_num_values) { - constexpr int32_t kBufferSize = 1024; - int32_t indices[kBufferSize]; - - RETURN_NOT_OK(builder->Reserve(num_values)); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); - - auto dict_values = reinterpret_cast(dictionary_->data()); - - int values_decoded = 0; - int num_appended = 0; - while (num_appended < num_values) { - bool is_valid = bit_reader.IsSet(); - bit_reader.Next(); - - if (is_valid) { - int32_t batch_size = - std::min(kBufferSize, num_values - num_appended - null_count); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - - int i = 0; - while (true) { - // Consume all indices - if (is_valid) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - RETURN_NOT_OK(builder->Append(val.ptr, val.len)); - ++i; - ++values_decoded; - } else { - RETURN_NOT_OK(builder->AppendNull()); - --null_count; + ++values_decoded; + return Status::OK(); + }; + + auto visit_null = [&]() -> Status { + RETURN_NOT_OK(helper.AppendNull()); + return Status::OK(); + }; + + ::arrow::internal::BitBlockCounter bit_blocks(valid_bits, valid_bits_offset, + num_values); + int64_t position = 0; + while (position < num_values) { + const auto block = bit_blocks.NextWord(); + if (block.AllSet()) { + for 
(int64_t i = 0; i < block.length; ++i, ++position) { + ARROW_RETURN_NOT_OK(visit_valid(position)); } - ++num_appended; - if (i == num_indices) { - // Do not advance the bit_reader if we have fulfilled the decode - // request - break; + } else if (block.NoneSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + ARROW_RETURN_NOT_OK(visit_null()); + } + } else { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (bit_util::GetBit(valid_bits, valid_bits_offset + position)) { + ARROW_RETURN_NOT_OK(visit_valid(position)); + } else { + ARROW_RETURN_NOT_OK(visit_null()); + } } - is_valid = bit_reader.IsSet(); - bit_reader.Next(); - } - } else { - RETURN_NOT_OK(builder->AppendNull()); - --null_count; - ++num_appended; - } - } - *out_num_values = values_decoded; - return Status::OK(); - } - - template - Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) { - constexpr int32_t kBufferSize = 2048; - int32_t indices[kBufferSize]; - - RETURN_NOT_OK(builder->Reserve(num_values)); - - auto dict_values = reinterpret_cast(dictionary_->data()); - - int values_decoded = 0; - while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (num_indices == 0) ParquetException::EofException(); - for (int i = 0; i < num_indices; ++i) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - RETURN_NOT_OK(builder->Append(val.ptr, val.len)); - } - values_decoded += num_indices; - } - *out_num_values = values_decoded; - return Status::OK(); - } -}; - -class DictLargeByteArrayDecoderImpl : public DictDecoderImpl, - virtual public LargeByteArrayDecoder { - public: - using BASE = DictDecoderImpl; - using BASE::DictDecoderImpl; - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::LargeBinaryDictionary32Builder* builder) override { - int result = 0; - if (null_count == 0) { - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); - } else { - PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, - valid_bits_offset, builder, &result)); - } - return result; - } - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out) override { - int result = 0; - if (null_count == 0) { - PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result)); - } else { - PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, - valid_bits_offset, out, &result)); - } - return result; - } - - private: - Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out, - int* out_num_values) { - constexpr int32_t kBufferSize = 1024; - int32_t indices[kBufferSize]; - - ArrowLargeBinaryHelper helper(out); - - auto dict_values = reinterpret_cast(dictionary_->data()); - int values_decoded = 0; - int num_indices = 0; - int pos_indices = 0; - - auto visit_valid = [&](int64_t position) -> Status { - if (num_indices == pos_indices) { - // Refill indices buffer - const auto batch_size = - std::min(kBufferSize, num_values - null_count - values_decoded); - num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (ARROW_PREDICT_FALSE(num_indices < 1)) { - return Status::Invalid("Invalid number of indices: ", num_indices); } - 
pos_indices = 0; - } - const auto index = indices[pos_indices++]; - RETURN_NOT_OK(IndexInBounds(index)); - const auto& val = dict_values[index]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); } - RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); - ++values_decoded; - return Status::OK(); - }; - auto visit_null = [&]() -> Status { - RETURN_NOT_OK(helper.AppendNull()); + *out_num_values = values_decoded; return Status::OK(); - }; - - ::arrow::internal::BitBlockCounter bit_blocks(valid_bits, valid_bits_offset, - num_values); - int64_t position = 0; - while (position < num_values) { - const auto block = bit_blocks.NextWord(); - if (block.AllSet()) { - for (int64_t i = 0; i < block.length; ++i, ++position) { - ARROW_RETURN_NOT_OK(visit_valid(position)); - } - } else if (block.NoneSet()) { - for (int64_t i = 0; i < block.length; ++i, ++position) { - ARROW_RETURN_NOT_OK(visit_null()); - } - } else { - for (int64_t i = 0; i < block.length; ++i, ++position) { - if (bit_util::GetBit(valid_bits, valid_bits_offset + position)) { - ARROW_RETURN_NOT_OK(visit_valid(position)); - } else { - ARROW_RETURN_NOT_OK(visit_null()); - } - } - } } - *out_num_values = values_decoded; - return Status::OK(); - } + Status DecodeArrowDenseNonNull(int num_values, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + constexpr int32_t kBufferSize = 2048; + int32_t indices[kBufferSize]; + int values_decoded = 0; - Status DecodeArrowDenseNonNull(int num_values, - typename EncodingTraits::Accumulator* out, - int* out_num_values) { - constexpr int32_t kBufferSize = 2048; - int32_t indices[kBufferSize]; - int values_decoded = 0; + ArrowBinaryHelperBase helper(out); + auto dict_values = reinterpret_cast(dictionary_->data()); - ArrowLargeBinaryHelper helper(out); - auto dict_values = reinterpret_cast(dictionary_->data()); - - while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (num_indices == 0) ParquetException::EofException(); - for (int i = 0; i < num_indices; ++i) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); + while (values_decoded < num_values) { + int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (num_indices == 0) ParquetException::EofException(); + for (int i = 0; i < num_indices; ++i) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { + RETURN_NOT_OK(helper.PushChunk()); + } + RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); } - RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); + values_decoded += num_indices; } - values_decoded += num_indices; + *out_num_values = values_decoded; + return Status::OK(); } - *out_num_values = values_decoded; - return Status::OK(); - } - - template - Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* out_num_values) { - constexpr int32_t kBufferSize = 1024; - int32_t indices[kBufferSize]; - - RETURN_NOT_OK(builder->Reserve(num_values)); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); - - auto dict_values 
= reinterpret_cast(dictionary_->data()); - - int values_decoded = 0; - int num_appended = 0; - while (num_appended < num_values) { - bool is_valid = bit_reader.IsSet(); - bit_reader.Next(); - - if (is_valid) { - int32_t batch_size = - std::min(kBufferSize, num_values - num_appended - null_count); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - int i = 0; - while (true) { - // Consume all indices - if (is_valid) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - RETURN_NOT_OK(builder->Append(val.ptr, val.len)); - ++i; - ++values_decoded; - } else { - RETURN_NOT_OK(builder->AppendNull()); - --null_count; + template + Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_num_values) { + constexpr int32_t kBufferSize = 1024; + int32_t indices[kBufferSize]; + + RETURN_NOT_OK(builder->Reserve(num_values)); + ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + + auto dict_values = reinterpret_cast(dictionary_->data()); + + int values_decoded = 0; + int num_appended = 0; + while (num_appended < num_values) { + bool is_valid = bit_reader.IsSet(); + bit_reader.Next(); + + if (is_valid) { + int32_t batch_size = + std::min(kBufferSize, num_values - num_appended - null_count); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + + int i = 0; + while (true) { + // Consume all indices + if (is_valid) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + RETURN_NOT_OK(builder->Append(val.ptr, val.len)); + ++i; + ++values_decoded; + } else { + RETURN_NOT_OK(builder->AppendNull()); + --null_count; + } + ++num_appended; + if (i == num_indices) { + // Do not advance the bit_reader if we have fulfilled the decode + // request + break; + } + is_valid = bit_reader.IsSet(); + bit_reader.Next(); } + } else { + RETURN_NOT_OK(builder->AppendNull()); + --null_count; ++num_appended; - if (i == num_indices) { - // Do not advance the bit_reader if we have fulfilled the decode - // request - break; - } - is_valid = bit_reader.IsSet(); - bit_reader.Next(); } - } else { - RETURN_NOT_OK(builder->AppendNull()); - --null_count; - ++num_appended; } + *out_num_values = values_decoded; + return Status::OK(); } - *out_num_values = values_decoded; - return Status::OK(); - } - template - Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) { - constexpr int32_t kBufferSize = 2048; - int32_t indices[kBufferSize]; + template + Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) { + constexpr int32_t kBufferSize = 2048; + int32_t indices[kBufferSize]; - RETURN_NOT_OK(builder->Reserve(num_values)); + RETURN_NOT_OK(builder->Reserve(num_values)); - auto dict_values = reinterpret_cast(dictionary_->data()); + auto dict_values = reinterpret_cast(dictionary_->data()); - int values_decoded = 0; - while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (num_indices == 0) ParquetException::EofException(); - for (int i = 0; i < num_indices; ++i) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - RETURN_NOT_OK(builder->Append(val.ptr, val.len)); + int values_decoded = 0; + while (values_decoded < num_values) { + int32_t batch_size = std::min(kBufferSize, 
num_values - values_decoded);
+      int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+      if (num_indices == 0) ParquetException::EofException();
+      for (int i = 0; i < num_indices; ++i) {
+        auto idx = indices[i];
+        RETURN_NOT_OK(IndexInBounds(idx));
+        const auto& val = dict_values[idx];
+        RETURN_NOT_OK(builder->Append(val.ptr, val.len));
+      }
+      values_decoded += num_indices;
     }
-      values_decoded += num_indices;
+    *out_num_values = values_decoded;
+    return Status::OK();
   }
-    *out_num_values = values_decoded;
-    return Status::OK();
-  }
 };
 
+using DictLargeByteArrayDecoderImpl = DictByteArrayDecoderImpl<LargeByteArrayType>;
+
 // ----------------------------------------------------------------------
 // DeltaBitPackEncoder
 
@@ -3800,7 +3594,7 @@ std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
       if (use_binary_large_variant) {
         return std::make_unique<DictLargeByteArrayDecoderImpl>(descr, pool);
       } else {
-        return std::make_unique<DictByteArrayDecoderImpl>(descr, pool);
+        return std::make_unique<DictByteArrayDecoderImpl<ByteArrayType>>(descr, pool);
       }
     case Type::FIXED_LEN_BYTE_ARRAY:
       return std::make_unique<DictDecoderImpl<FLBAType>>(descr, pool);

From eb71c17fa906733256ad6eb2cf30bc8415d87b23 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 6 Jun 2023 10:11:04 -0300
Subject: [PATCH 27/69] remove one default argument

---
 cpp/src/parquet/arrow/schema_internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h
index d27440ea22301..2173e4ea18ad0 100644
--- a/cpp/src/parquet/arrow/schema_internal.h
+++ b/cpp/src/parquet/arrow/schema_internal.h
@@ -40,7 +40,7 @@ Result<std::shared_ptr<::arrow::DataType>> FromInt64(const LogicalType& logical_
 
 Result<std::shared_ptr<::arrow::DataType>> GetArrowType(Type::type physical_type,
                                                         const LogicalType& logical_type,
                                                         int type_length,
-                                                        bool use_binary_large_variant = false);
+                                                        bool use_binary_large_variant);
 
 Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
     Type::type physical_type, const LogicalType& logical_type, int type_length,

From 686a3f7d9994e3112955d53a119bd73f265b3372 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 6 Jun 2023 10:15:08 -0300
Subject: [PATCH 28/69] remove junk code

---
 cpp/src/parquet/arrow/reader_internal.cc | 1 -
 cpp/src/parquet/column_reader.h          | 6 ------
 cpp/src/parquet/column_writer.h          | 1 -
 cpp/src/parquet/encoding.cc              | 1 -
 4 files changed, 9 deletions(-)

diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc
index b9c913bc24291..a294b712a7ce3 100644
--- a/cpp/src/parquet/arrow/reader_internal.cc
+++ b/cpp/src/parquet/arrow/reader_internal.cc
@@ -85,7 +85,6 @@ using ::arrow::internal::SafeLeftShift;
 using ::arrow::util::SafeLoadAs;
 
 using parquet::internal::BinaryRecordReader;
-using parquet::internal::LargeBinaryRecordReader;
 using parquet::internal::DictionaryRecordReader;
 using parquet::internal::RecordReader;
 using parquet::schema::GroupNode;
diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h
index 2c6dfea9d39a1..471117c1f13e6 100644
--- a/cpp/src/parquet/column_reader.h
+++ b/cpp/src/parquet/column_reader.h
@@ -471,11 +471,6 @@ class BinaryRecordReader : virtual public RecordReader {
   virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
 };
 
-class LargeBinaryRecordReader : virtual public BinaryRecordReader {
- public:
-  virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
-};
-
 /// \brief Read records directly to dictionary-encoded Arrow form (int32
 /// indices). 
Only valid for BYTE_ARRAY columns class DictionaryRecordReader : virtual public RecordReader { @@ -492,7 +487,6 @@ using Int96Reader = TypedColumnReader; using FloatReader = TypedColumnReader; using DoubleReader = TypedColumnReader; using ByteArrayReader = TypedColumnReader; -using LargeByteArrayReader = TypedColumnReader; using FixedLenByteArrayReader = TypedColumnReader; } // namespace parquet diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 545ecbb6732f8..792b108ac8835 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -233,7 +233,6 @@ using Int96Writer = TypedColumnWriter; using FloatWriter = TypedColumnWriter; using DoubleWriter = TypedColumnWriter; using ByteArrayWriter = TypedColumnWriter; -using LargeByteArrayWriter = TypedColumnWriter; using FixedLenByteArrayWriter = TypedColumnWriter; namespace internal { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 1067f5af0c4bc..8ea91d4e33794 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -26,7 +26,6 @@ #include #include #include -#include #include "arrow/array.h" #include "arrow/array/builder_dict.h" From a61fc32cdfbb420f3506589f649b4922d067ff9d Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 10:20:46 -0300 Subject: [PATCH 29/69] move use_binary_large_variant check inside frombytearray --- cpp/src/parquet/arrow/schema_internal.cc | 26 +++++------------------- cpp/src/parquet/arrow/schema_internal.h | 4 ++-- cpp/src/parquet/types.h | 2 +- 3 files changed, 8 insertions(+), 24 deletions(-) diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index a256ec4a6d7f9..a971f334dccb2 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -110,34 +110,18 @@ Result> MakeArrowTimestamp(const LogicalType& logical } } -Result> FromByteArray(const LogicalType& logical_type) { +Result> FromByteArray(const LogicalType& logical_type, + bool use_binary_large_variant) { switch (logical_type.type()) { case LogicalType::Type::STRING: - return ::arrow::utf8(); + return use_binary_large_variant ? ::arrow::large_utf8() : ::arrow::utf8(); case LogicalType::Type::DECIMAL: return MakeArrowDecimal(logical_type); case LogicalType::Type::NONE: case LogicalType::Type::ENUM: case LogicalType::Type::JSON: case LogicalType::Type::BSON: - return ::arrow::binary(); - default: - return Status::NotImplemented("Unhandled logical logical_type ", - logical_type.ToString(), " for binary array"); - } -} - -Result> FromLargeByteArray(const LogicalType& logical_type) { - switch (logical_type.type()) { - case LogicalType::Type::STRING: - return ::arrow::large_utf8(); - case LogicalType::Type::DECIMAL: - return MakeArrowDecimal(logical_type); - case LogicalType::Type::NONE: - case LogicalType::Type::ENUM: - case LogicalType::Type::JSON: - case LogicalType::Type::BSON: - return ::arrow::large_binary(); + return use_binary_large_variant ? ::arrow::large_binary() : ::arrow::binary(); default: return Status::NotImplemented("Unhandled logical logical_type ", logical_type.ToString(), " for binary array"); @@ -217,7 +201,7 @@ Result> GetArrowType( case ParquetType::DOUBLE: return ::arrow::float64(); case ParquetType::BYTE_ARRAY: - return use_binary_large_variant ? 
FromLargeByteArray(logical_type) : FromByteArray(logical_type); + return FromByteArray(logical_type, use_binary_large_variant); case ParquetType::FIXED_LEN_BYTE_ARRAY: return FromFLBA(logical_type, type_length); default: { diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index 2173e4ea18ad0..9bcebc49d3b96 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -29,8 +29,8 @@ namespace arrow { using ::arrow::Result; -Result> FromByteArray(const LogicalType& logical_type); -Result> FromLargeByteArray(const LogicalType& logical_type); +Result> FromByteArray(const LogicalType& logical_type, + bool use_binary_large_variant); Result> FromFLBA(const LogicalType& logical_type, int32_t physical_length); diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 6dcc5e082468b..41bf6c903e3e4 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -65,7 +65,7 @@ struct Type { BYTE_ARRAY = 6, FIXED_LEN_BYTE_ARRAY = 7, // Should always be last element. - UNDEFINED = 9 + UNDEFINED }; }; From e2600d0620345afad064a8da17fce0015e35c022 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 10:36:25 -0300 Subject: [PATCH 30/69] simplify chunkedrecordreader a bit --- cpp/src/parquet/column_reader.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 10a63ede4fd59..e5910e2d79b14 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2119,6 +2119,9 @@ template class ChunkedRecordReader : public TypedRecordReader, virtual public BinaryRecordReader { public: + using BASE = TypedRecordReader; + using BASE::descr_; + using BASE::ResetValues; using BuilderType = typename ByteArrayBuilderTypeTrait::BuilderType; ChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, @@ -2126,7 +2129,7 @@ class ChunkedRecordReader : public TypedRecordReader, : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { static_assert(IsByteArrayType::value, "Invalid ByteArrayType"); - ARROW_DCHECK_EQ(TypedRecordReader::descr_->physical_type(), Type::BYTE_ARRAY); + ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); accumulator_.builder = std::make_unique(pool); } @@ -2145,7 +2148,7 @@ class ChunkedRecordReader : public TypedRecordReader, int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull( static_cast(values_to_read), &accumulator_); CheckNumberDecoded(num_decoded, values_to_read); - TypedRecordReader::ResetValues(); + ResetValues(); } void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { @@ -2153,7 +2156,7 @@ class ChunkedRecordReader : public TypedRecordReader, static_cast(values_to_read), static_cast(null_count), valid_bits_->mutable_data(), values_written_, &accumulator_); CheckNumberDecoded(num_decoded, values_to_read - null_count); - TypedRecordReader::ResetValues(); + ResetValues(); } private: From 3b86e23e4bbc1079b78f901b1415e3c7aeea432a Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 11:05:26 -0300 Subject: [PATCH 31/69] simplify DictionaryRecordReaderImpl and fix DebugPrintState --- cpp/src/parquet/column_reader.cc | 69 ++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index e5910e2d79b14..7dd31632ba14d 100644 --- a/cpp/src/parquet/column_reader.cc +++ 
b/cpp/src/parquet/column_reader.cc @@ -1988,33 +1988,33 @@ class TypedRecordReader : public TypedColumnReaderImpl, } void DebugPrintState() override { -// const int16_t* def_levels = this->def_levels(); -// const int16_t* rep_levels = this->rep_levels(); -// const int64_t total_levels_read = levels_position_; -// -// const T* vals = reinterpret_cast(this->values()); -// -// if (leaf_info_.def_level > 0) { -// std::cout << "def levels: "; -// for (int64_t i = 0; i < total_levels_read; ++i) { -// std::cout << def_levels[i] << " "; -// } -// std::cout << std::endl; -// } -// -// if (leaf_info_.rep_level > 0) { -// std::cout << "rep levels: "; -// for (int64_t i = 0; i < total_levels_read; ++i) { -// std::cout << rep_levels[i] << " "; -// } -// std::cout << std::endl; -// } -// -// std::cout << "values: "; -// for (int64_t i = 0; i < this->values_written(); ++i) { -//// std::cout << vals[i] << " "; -// } -// std::cout << std::endl; + const int16_t* def_levels = this->def_levels(); + const int16_t* rep_levels = this->rep_levels(); + const int64_t total_levels_read = levels_position_; + + const T* vals = reinterpret_cast(this->values()); + + if (leaf_info_.def_level > 0) { + std::cout << "def levels: "; + for (int64_t i = 0; i < total_levels_read; ++i) { + std::cout << def_levels[i] << " "; + } + std::cout << std::endl; + } + + if (leaf_info_.rep_level > 0) { + std::cout << "rep levels: "; + for (int64_t i = 0; i < total_levels_read; ++i) { + std::cout << rep_levels[i] << " "; + } + std::cout << std::endl; + } + + std::cout << "values: "; + for (int64_t i = 0; i < this->values_written(); ++i) { + std::cout << vals[i] << " "; + } + std::cout << std::endl; } void ResetValues() { @@ -2171,6 +2171,10 @@ using LargeByteArrayChunkedRecordReader = ChunkedRecordReader class DictionaryRecordReaderImpl : public TypedRecordReader, virtual public DictionaryRecordReader { + using BASE = TypedRecordReader; + using BASE::current_encoding_; + using BASE::ResetValues; + public: DictionaryRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) @@ -2211,7 +2215,7 @@ class DictionaryRecordReaderImpl : public TypedRecordReader, void ReadValuesDense(int64_t values_to_read) override { int64_t num_decoded = 0; - if (TypedRecordReader::current_encoding_ == Encoding::RLE_DICTIONARY) { + if (current_encoding_ == Encoding::RLE_DICTIONARY) { MaybeWriteNewDictionary(); auto decoder = dynamic_cast(this->current_decoder_); num_decoded = decoder->DecodeIndices(static_cast(values_to_read), &builder_); @@ -2220,14 +2224,14 @@ class DictionaryRecordReaderImpl : public TypedRecordReader, static_cast(values_to_read), &builder_); /// Flush values since they have been copied into the builder - TypedRecordReader::ResetValues(); + ResetValues(); } CheckNumberDecoded(num_decoded, values_to_read); } void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { int64_t num_decoded = 0; - if (TypedRecordReader::current_encoding_ == Encoding::RLE_DICTIONARY) { + if (current_encoding_ == Encoding::RLE_DICTIONARY) { MaybeWriteNewDictionary(); auto decoder = dynamic_cast(this->current_decoder_); num_decoded = decoder->DecodeIndicesSpaced( @@ -2239,7 +2243,7 @@ class DictionaryRecordReaderImpl : public TypedRecordReader, valid_bits_->mutable_data(), values_written_, &builder_); /// Flush values since they have been copied into the builder - TypedRecordReader::ResetValues(); + ResetValues(); } ARROW_DCHECK_EQ(num_decoded, values_to_read - null_count); } @@ 
-2261,6 +2265,9 @@ void TypedRecordReader::DebugPrintState() {} template <> void TypedRecordReader::DebugPrintState() {} +template <> +void TypedRecordReader::DebugPrintState() {} + template <> void TypedRecordReader::DebugPrintState() {} From cc027b7a152dfcbc5cac25647ba4ed398c502766 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 6 Jun 2023 11:08:04 -0300 Subject: [PATCH 32/69] simplify PlainByteArrayDecoderBase --- cpp/src/parquet/encoding.cc | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 8ea91d4e33794..b49c351b7403a 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1360,6 +1360,9 @@ class PlainByteArrayDecoderBase : public PlainDecoder, virtual public TypedDecoder { public: using Base = PlainDecoder; + using Base::len_; + using Base::data_; + using Base::num_values_; using Base::DecodeSpaced; using Base::PlainDecoder; @@ -1397,21 +1400,21 @@ class PlainByteArrayDecoderBase : public PlainDecoder, RETURN_NOT_OK(helper.builder->Reserve(num_values)); RETURN_NOT_OK(helper.builder->ReserveData( - std::min(PlainDecoder::len_, helper.chunk_space_remaining))); + std::min(len_, helper.chunk_space_remaining))); int i = 0; RETURN_NOT_OK(VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { - if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < 4)) { + if (ARROW_PREDICT_FALSE(len_ < 4)) { ParquetException::EofException(); } - auto value_len = SafeLoadAs(PlainDecoder::data_); + auto value_len = SafeLoadAs(data_); if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); } auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < increment)) { + if (ARROW_PREDICT_FALSE(len_ < increment)) { ParquetException::EofException(); } if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) { @@ -1419,11 +1422,11 @@ class PlainByteArrayDecoderBase : public PlainDecoder, RETURN_NOT_OK(helper.PushChunk()); RETURN_NOT_OK(helper.builder->Reserve(num_values - i)); RETURN_NOT_OK(helper.builder->ReserveData( - std::min(PlainDecoder::len_, helper.chunk_space_remaining))); + std::min(len_, helper.chunk_space_remaining))); } - helper.UnsafeAppend(PlainDecoder::data_ + 4, value_len); - PlainDecoder::data_ += increment; - PlainDecoder::len_ -= increment; + helper.UnsafeAppend(data_ + 4, value_len); + data_ += increment; + len_ -= increment; ++values_decoded; ++i; return Status::OK(); @@ -1434,7 +1437,7 @@ class PlainByteArrayDecoderBase : public PlainDecoder, return Status::OK(); })); - PlainDecoder::num_values_ -= values_decoded; + num_values_ -= values_decoded; *out_values_decoded = values_decoded; return Status::OK(); } @@ -1449,26 +1452,26 @@ class PlainByteArrayDecoderBase : public PlainDecoder, RETURN_NOT_OK(VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { - if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < 4)) { + if (ARROW_PREDICT_FALSE(len_ < 4)) { ParquetException::EofException(); } - auto value_len = SafeLoadAs(PlainDecoder::data_); + auto value_len = SafeLoadAs(data_); if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); } auto increment = value_len + 4; - if (ARROW_PREDICT_FALSE(PlainDecoder::len_ < increment)) { + if (ARROW_PREDICT_FALSE(len_ < increment)) { ParquetException::EofException(); } - 
RETURN_NOT_OK(builder->Append(PlainDecoder::data_ + 4, value_len));
-          PlainDecoder::data_ += increment;
-          PlainDecoder::len_ -= increment;
+          RETURN_NOT_OK(builder->Append(data_ + 4, value_len));
+          data_ += increment;
+          len_ -= increment;
           ++values_decoded;
           return Status::OK();
         },
         [&]() { return builder->AppendNull(); }));
 
-    PlainDecoder::num_values_ -= values_decoded;
+    num_values_ -= values_decoded;
     *out_values_decoded = values_decoded;
     return Status::OK();
   }

From 177db7af52dc51563fb04d3669ed2a49ee0d67d8 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Wed, 7 Jun 2023 11:01:34 -0300
Subject: [PATCH 33/69] remove some todos

---
 cpp/src/parquet/stream_reader.cc | 1 -
 cpp/src/parquet/stream_writer.cc | 1 -
 2 files changed, 2 deletions(-)

diff --git a/cpp/src/parquet/stream_reader.cc b/cpp/src/parquet/stream_reader.cc
index fc22a76ab0ca9..0fecb1bf24615 100644
--- a/cpp/src/parquet/stream_reader.cc
+++ b/cpp/src/parquet/stream_reader.cc
@@ -488,7 +488,6 @@ void StreamReader::SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_sk
     case Type::BYTE_ARRAY:
       num_skipped = static_cast<ByteArrayReader*>(reader)->Skip(num_rows_to_skip);
       break;
-    // TODO AP FIX ARTHUR PASSOS
     case Type::FIXED_LEN_BYTE_ARRAY:
       num_skipped = static_cast<FixedLenByteArrayReader*>(reader)->Skip(num_rows_to_skip);
       break;
diff --git a/cpp/src/parquet/stream_writer.cc b/cpp/src/parquet/stream_writer.cc
index e7cf995c4f486..856436d701816 100644
--- a/cpp/src/parquet/stream_writer.cc
+++ b/cpp/src/parquet/stream_writer.cc
@@ -251,7 +251,6 @@ void StreamWriter::WriteNullValue(ColumnWriter* writer) {
       static_cast<ByteArrayWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
                                                         &kRepLevelZero, nullptr);
       break;
-    // TODO AP FIX ARTHUR PASSOS
     case Type::FIXED_LEN_BYTE_ARRAY:
       static_cast<FixedLenByteArrayWriter*>(writer)->WriteBatch(
           kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);

From 66223ee1008f5aaa78fd409fea68d96eb65890ed Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Wed, 7 Jun 2023 15:47:31 -0300
Subject: [PATCH 34/69] Add comment explaining why struct LargeByteArrayType instead of alias

---
 cpp/src/parquet/types.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h
index 41bf6c903e3e4..f3429746d3ca4 100644
--- a/cpp/src/parquet/types.h
+++ b/cpp/src/parquet/types.h
@@ -763,7 +763,11 @@ using DoubleType = PhysicalType<Type::DOUBLE>;
 using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
 
 /*
- * TODO AP add a comment explaining why the below is needed
+ * Parquet does not have a LARGE_BYTE_ARRAY_TYPE, but arrow does.
+ * It is used to store ByteArrays with length > 2^31 - 1.
+ * The below LargeByteArrayType is used by other classes to select the proper
+ * Readers/Writers/Builders/Encoders/Decoders by using the templated EncodingTraits. 
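The rationale in this comment can be made concrete with a small, self-contained
sketch of tag-type dispatch. The names below (the Tag structs, Traits,
MakeBuilderFor) are simplified stand-ins invented for illustration, not the
actual parquet declarations:

  // Simplified stand-in for EncodingTraits-style selection (illustrative only).
  #include <memory>
  #include "arrow/builder.h"

  struct ByteArrayTag {};
  // An empty derived struct is a *distinct type* to the template machinery,
  // unlike a type alias, so it can carry its own trait specialization while
  // behaving like the base type everywhere else.
  struct LargeByteArrayTag : ByteArrayTag {};

  template <typename Tag>
  struct Traits;  // primary template intentionally left undefined

  template <>
  struct Traits<ByteArrayTag> {
    using BuilderType = ::arrow::BinaryBuilder;  // 32-bit offsets
  };

  template <>
  struct Traits<LargeByteArrayTag> {
    using BuilderType = ::arrow::LargeBinaryBuilder;  // 64-bit offsets
  };

  // Code templated on the tag picks the matching builder automatically.
  template <typename Tag>
  std::unique_ptr<typename Traits<Tag>::BuilderType> MakeBuilderFor() {
    return std::make_unique<typename Traits<Tag>::BuilderType>();
  }

Had LargeByteArrayType been a plain alias of ByteArrayType, both trait
specializations would name the same type and fail to compile; deriving an
empty struct keeps the value layout while giving the compiler a distinct type
to dispatch on.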
+ * Since there is not a parquet equivalent, a struct has to be used as a workaround * */ struct LargeByteArrayType : public ByteArrayType {}; From 5cd39d8f629c6d3554e91d212d6a12ad319f0266 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 8 Jun 2023 10:01:37 -0300 Subject: [PATCH 35/69] address some pr comments --- .../parquet/parquet_arrow/reader_writer.cc | 2 +- .../parquet/arrow/arrow_reader_writer_test.cc | 6 +--- cpp/src/parquet/arrow/reader.cc | 5 ++- cpp/src/parquet/arrow/reader_internal.h | 2 +- cpp/src/parquet/arrow/schema.cc | 3 +- cpp/src/parquet/arrow/schema_internal.cc | 14 ++++---- cpp/src/parquet/arrow/schema_internal.h | 8 ++--- cpp/src/parquet/column_reader.cc | 33 +++++++++---------- cpp/src/parquet/encoding.cc | 10 +++--- cpp/src/parquet/encoding.h | 4 +-- cpp/src/parquet/properties.h | 13 ++++---- cpp/src/parquet/types.h | 2 +- 12 files changed, 48 insertions(+), 54 deletions(-) diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index debf62736bdd0..f5d96ec16ca64 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -137,4 +137,4 @@ int main(int argc, char** argv) { read_single_rowgroup(); read_single_column(); read_single_column_chunk(); -} \ No newline at end of file +} diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 0196f73e91a92..a98e0b321be0b 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3870,13 +3870,9 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) { TEST(TestArrowParquet, LargeByteArray) { auto path = test::get_data_file("chunked_string_map.parquet"); - TryReadDataFile(path, ::arrow::StatusCode::NotImplemented); - auto reader_properties = default_arrow_reader_properties(); - - reader_properties.set_use_binary_large_variants(true); - + reader_properties.set_use_large_binary_variants(true); TryReadDataFileWithProperties(path, reader_properties); } diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 2de3c9ab4b006..b163eaa4850c0 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -219,7 +219,7 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; - ctx->use_binary_large_variants = reader_properties_.use_binary_large_variants(); + ctx->use_large_binary_variants = reader_properties_.use_large_binary_variants(); return GetReader(manifest_.schema_fields[i], ctx, out); } @@ -467,8 +467,7 @@ class LeafReader : public ColumnReaderImpl { leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, /*read_dense_for_nullable*/ false, - ctx_->use_binary_large_variants - ); + ctx_->use_large_binary_variants); NextRowGroup(); } diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index c5ee54b7c03d4..6a904f3d45b6e 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -109,7 +109,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; - bool use_binary_large_variants = false; + bool use_large_binary_variants = false; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git a/cpp/src/parquet/arrow/schema.cc 
b/cpp/src/parquet/arrow/schema.cc index 4920bad21f0df..799c9a244ff43 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -473,7 +473,8 @@ ::arrow::Result> GetTypeForNode( SchemaTreeContext* ctx) { ASSIGN_OR_RAISE( std::shared_ptr storage_type, - GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit(), ctx->properties.use_binary_large_variants())); + GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit(), + ctx->properties.use_large_binary_variants())); if (ctx->properties.read_dictionary(column_index) && IsDictionaryReadSupported(*storage_type)) { return ::arrow::dictionary(::arrow::int32(), storage_type); diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index a971f334dccb2..1cf0ce34706ce 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -111,17 +111,17 @@ Result> MakeArrowTimestamp(const LogicalType& logical } Result> FromByteArray(const LogicalType& logical_type, - bool use_binary_large_variant) { + bool use_large_binary_variants) { switch (logical_type.type()) { case LogicalType::Type::STRING: - return use_binary_large_variant ? ::arrow::large_utf8() : ::arrow::utf8(); + return use_large_binary_variants ? ::arrow::large_utf8() : ::arrow::utf8(); case LogicalType::Type::DECIMAL: return MakeArrowDecimal(logical_type); case LogicalType::Type::NONE: case LogicalType::Type::ENUM: case LogicalType::Type::JSON: case LogicalType::Type::BSON: - return use_binary_large_variant ? ::arrow::large_binary() : ::arrow::binary(); + return use_large_binary_variants ? ::arrow::large_binary() : ::arrow::binary(); default: return Status::NotImplemented("Unhandled logical logical_type ", logical_type.ToString(), " for binary array"); @@ -182,7 +182,7 @@ Result> FromInt64(const LogicalType& logical_type) { Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, - const ::arrow::TimeUnit::type int96_arrow_time_unit, bool use_binary_large_variant) { + const ::arrow::TimeUnit::type int96_arrow_time_unit, bool use_large_binary_variants) { if (logical_type.is_invalid() || logical_type.is_null()) { return ::arrow::null(); } @@ -201,7 +201,7 @@ Result> GetArrowType( case ParquetType::DOUBLE: return ::arrow::float64(); case ParquetType::BYTE_ARRAY: - return FromByteArray(logical_type, use_binary_large_variant); + return FromByteArray(logical_type, use_large_binary_variants); case ParquetType::FIXED_LEN_BYTE_ARRAY: return FromFLBA(logical_type, type_length); default: { @@ -215,9 +215,9 @@ Result> GetArrowType( Result> GetArrowType( const schema::PrimitiveNode& primitive, const ::arrow::TimeUnit::type int96_arrow_time_unit, - bool use_binary_large_variant) { + bool use_large_binary_variants) { return GetArrowType(primitive.physical_type(), *primitive.logical_type(), - primitive.type_length(), int96_arrow_time_unit, use_binary_large_variant); + primitive.type_length(), int96_arrow_time_unit, use_large_binary_variants); } } // namespace arrow diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index 9bcebc49d3b96..67aecf6e73f1a 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -30,7 +30,7 @@ namespace arrow { using ::arrow::Result; Result> FromByteArray(const LogicalType& logical_type, - bool use_binary_large_variant); + bool use_large_binary_variants); Result> FromFLBA(const LogicalType& logical_type, int32_t 
physical_length); @@ -40,17 +40,17 @@ Result> FromInt64(const LogicalType& logical_ Result> GetArrowType(Type::type physical_type, const LogicalType& logical_type, int type_length, - bool use_binary_large_variant); + bool use_large_binary_variants); Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO, - bool use_binary_large_variant = false); + bool use_large_binary_variants = false); Result> GetArrowType( const schema::PrimitiveNode& primitive, ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO, - bool use_binary_large_variant = false); + bool use_large_binary_variants = false); } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 7dd31632ba14d..87d8e33d19df8 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2093,22 +2093,22 @@ class FLBARecordReader : public TypedRecordReader, std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_; }; -// Below concept could be used to simplify type assertion, but it seems like c++20 is not -// available +// TODO Below concept could be used to simplify type assertion, +// but it requires c++20 //template //concept ByteArrayTypeConcept = std::is_same::value || // std::is_same::value; -template +template struct IsByteArrayType : std::false_type {}; -template<> +template <> struct IsByteArrayType : std::true_type {}; -template<> +template <> struct IsByteArrayType : std::true_type {}; -template +template struct ByteArrayBuilderTypeTrait { using BuilderType = typename std::conditional::value, ::arrow::LargeBinaryBuilder, @@ -2116,15 +2116,15 @@ struct ByteArrayBuilderTypeTrait { }; template -class ChunkedRecordReader : public TypedRecordReader, - virtual public BinaryRecordReader { +class ByteArrayChunkedRecordReaderImpl : public TypedRecordReader, + virtual public BinaryRecordReader { public: using BASE = TypedRecordReader; using BASE::descr_; using BASE::ResetValues; using BuilderType = typename ByteArrayBuilderTypeTrait::BuilderType; - ChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, + ByteArrayChunkedRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { @@ -2164,19 +2164,18 @@ class ChunkedRecordReader : public TypedRecordReader, typename EncodingTraits::Accumulator accumulator_; }; -using ByteArrayChunkedRecordReader = ChunkedRecordReader; -using LargeByteArrayChunkedRecordReader = ChunkedRecordReader; - +using ByteArrayChunkedRecordReader = ByteArrayChunkedRecordReaderImpl; +using LargeByteArrayChunkedRecordReader = ByteArrayChunkedRecordReaderImpl; template -class DictionaryRecordReaderImpl : public TypedRecordReader, - virtual public DictionaryRecordReader { +class ByteArrayDictionaryRecordReaderImpl : public TypedRecordReader, + virtual public DictionaryRecordReader { using BASE = TypedRecordReader; using BASE::current_encoding_; using BASE::ResetValues; public: - DictionaryRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, + ByteArrayDictionaryRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, bool read_dense_for_nullable) : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), builder_(pool) { @@ -2255,8 +2254,8 @@ class DictionaryRecordReaderImpl : 
public TypedRecordReader, std::vector> result_chunks_; }; -using ByteArrayDictionaryRecordReader = DictionaryRecordReaderImpl; -using LargeByteArrayDictionaryRecordReader = DictionaryRecordReaderImpl; +using ByteArrayDictionaryRecordReader = ByteArrayDictionaryRecordReaderImpl; +using LargeByteArrayDictionaryRecordReader = ByteArrayDictionaryRecordReaderImpl; // TODO(wesm): Implement these to some satisfaction template <> diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b49c351b7403a..0274531b93f0d 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1926,7 +1926,7 @@ template <> void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { auto binary_builder = checked_cast<::arrow::LargeBinaryDictionary32Builder*>(builder); - // Make a BinaryArray referencing the internal dictionary data + // Make a LargeBinaryArray referencing the internal dictionary data auto arr = std::make_shared<::arrow::LargeBinaryArray>( dictionary_length_, byte_array_offsets_, byte_array_data_); PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); @@ -3507,7 +3507,7 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool, bool use_binary_large_variant) { + ::arrow::MemoryPool* pool, bool use_large_binary_variants) { if (encoding == Encoding::PLAIN) { switch (type_num) { case Type::BOOLEAN: @@ -3523,7 +3523,7 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin case Type::DOUBLE: return std::make_unique>(descr); case Type::BYTE_ARRAY: - if (use_binary_large_variant) { + if (use_large_binary_variants) { return std::make_unique(descr); } else { return std::make_unique(descr); @@ -3578,7 +3578,7 @@ namespace detail { std::unique_ptr MakeDictDecoder(Type::type type_num, const ColumnDescriptor* descr, MemoryPool* pool, - bool use_binary_large_variant) { + bool use_large_binary_variants) { switch (type_num) { case Type::BOOLEAN: ParquetException::NYI("Dictionary encoding not implemented for boolean type"); @@ -3593,7 +3593,7 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, case Type::DOUBLE: return std::make_unique>(descr, pool); case Type::BYTE_ARRAY: - if (use_binary_large_variant) { + if (use_large_binary_variants) { return std::make_unique(descr, pool); } else { return std::make_unique>(descr, pool); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 6ebdd59c35214..622f1d939e773 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -462,7 +462,7 @@ std::unique_ptr::Encoder> MakeTypedEncoder( PARQUET_EXPORT std::unique_ptr MakeDecoder( Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool use_binary_large_variant = false); + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool use_large_binary_variants = false); namespace detail { @@ -470,7 +470,7 @@ PARQUET_EXPORT std::unique_ptr MakeDictDecoder(Type::type type_num, const ColumnDescriptor* descr, ::arrow::MemoryPool* pool, - bool use_binary_large_variant); + bool use_large_binary_variants); } // namespace detail diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 1a56064a0864e..4e55b50375d7e 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -124,7 +124,6 @@ class PARQUET_EXPORT ReaderProperties { bool 
buffered_stream_enabled_ = false; bool page_checksum_verification_ = false; std::shared_ptr file_decryption_properties_; - bool use_binary_large_variants_ = false; }; ReaderProperties PARQUET_EXPORT default_reader_properties(); @@ -785,7 +784,7 @@ class PARQUET_EXPORT ArrowReaderProperties { pre_buffer_(false), cache_options_(::arrow::io::CacheOptions::Defaults()), coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO), - use_binary_large_variants_(false) {} + use_large_binary_variants_(false) {} /// \brief Set whether to use the IO thread pool to parse columns in parallel. /// @@ -853,12 +852,12 @@ class PARQUET_EXPORT ArrowReaderProperties { return coerce_int96_timestamp_unit_; } - void set_use_binary_large_variants(bool use_binary_large_variants) { - use_binary_large_variants_ = use_binary_large_variants; + void set_use_large_binary_variants(bool use_large_binary_variants) { + use_large_binary_variants_ = use_large_binary_variants; } - bool use_binary_large_variants() const { - return use_binary_large_variants_; + bool use_large_binary_variants() const { + return use_large_binary_variants_; } private: @@ -869,7 +868,7 @@ class PARQUET_EXPORT ArrowReaderProperties { ::arrow::io::IOContext io_context_; ::arrow::io::CacheOptions cache_options_; ::arrow::TimeUnit::type coerce_int96_timestamp_unit_; - bool use_binary_large_variants_; + bool use_large_binary_variants_; }; /// EXPERIMENTAL: Constructs the default ArrowReaderProperties diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index f3429746d3ca4..1665e0c0222a3 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -65,7 +65,7 @@ struct Type { BYTE_ARRAY = 6, FIXED_LEN_BYTE_ARRAY = 7, // Should always be last element. - UNDEFINED + UNDEFINED = 8 }; }; From 10890104ca87bfe1cd92ac202f2435b590933cda Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 8 Jun 2023 14:29:53 -0300 Subject: [PATCH 36/69] address a few more comments --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 2 +- cpp/src/parquet/encoding.cc | 1 - cpp/src/parquet/types.h | 8 +++----- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index a98e0b321be0b..f92637ac4d406 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3871,7 +3871,7 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) { TEST(TestArrowParquet, LargeByteArray) { auto path = test::get_data_file("chunked_string_map.parquet"); TryReadDataFile(path, ::arrow::StatusCode::NotImplemented); - auto reader_properties = default_arrow_reader_properties(); + ArrowReaderProperties reader_properties; reader_properties.set_use_large_binary_variants(true); TryReadDataFileWithProperties(path, reader_properties); } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 0274531b93f0d..b49db31d4bff1 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1277,7 +1277,6 @@ struct ArrowBinaryHelperBase { }; using ArrowBinaryHelper = ArrowBinaryHelperBase; -using ArrowLargeBinaryHelper = ArrowBinaryHelperBase; template <> inline int PlainDecoder::DecodeArrow( diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 1665e0c0222a3..f24cad9e87bcc 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -763,11 +763,9 @@ using DoubleType = PhysicalType; using ByteArrayType = PhysicalType; /* - * Parquet does not have a LARGE_BYTE_ARRAY_TYPE, but arrow 
does. - * It is used to store ByteArrays with length > 2^31 - 1. - * The below LargeByteArrayType is used by other classes to select the proper - * Readers/Writers/Builders/Encoders/Decoders by using the templated EncodingTraits. - * Since there is not a parquet equivalent, a struct has to be used as a workaround + * Parquet uses ByteArrayType for variable length strings and binaries and their lengths + * will not exceed 2^31 - 1. However, arrow supports StringType/BinaryType and their + * large variants (i.e. LargeStringType and LargeBinaryType). * */ struct LargeByteArrayType : public ByteArrayType {}; From a6c42ee122e0582f5f3f6b6e6ccb07703a031107 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 8 Jun 2023 14:57:12 -0300 Subject: [PATCH 37/69] remove arrow-type include & move binarylimit trait --- cpp/src/parquet/encoding.cc | 20 ++++++++++++++++++-- cpp/src/parquet/encoding.h | 6 ------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b49db31d4bff1..3a1c4da937a24 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1238,20 +1238,36 @@ int PlainBooleanDecoder::Decode(bool* buffer, int max_values) { return max_values; } +template +struct ArrowBinaryHelperTraits; + +template <> +struct ArrowBinaryHelperTraits +{ + static constexpr auto memory_limit = ::arrow::kBinaryMemoryLimit; +}; + +template <> +struct ArrowBinaryHelperTraits +{ + static constexpr auto memory_limit = ::arrow::kLargeBinaryMemoryLimit; +}; + template struct ArrowBinaryHelperBase { + explicit ArrowBinaryHelperBase(typename EncodingTraits::Accumulator* out) { this->out = out; this->builder = out->builder.get(); this->chunk_space_remaining = - EncodingTraits::memory_limit - this->builder->value_data_length(); + ArrowBinaryHelperTraits::memory_limit - this->builder->value_data_length(); } Status PushChunk() { std::shared_ptr<::arrow::Array> result; RETURN_NOT_OK(builder->Finish(&result)); out->chunks.push_back(result); - chunk_space_remaining = ::arrow::kBinaryMemoryLimit; + chunk_space_remaining = ArrowBinaryHelperTraits::memory_limit; return Status::OK(); } diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 622f1d939e773..af5425fdc54b3 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -24,7 +24,6 @@ #include "arrow/util/spaced.h" -#include "arrow/type.h" #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/types.h" @@ -153,9 +152,6 @@ struct EncodingTraits { }; using ArrowType = ::arrow::BinaryType; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; - - - static constexpr auto memory_limit = ::arrow::kBinaryMemoryLimit; }; template <> @@ -172,8 +168,6 @@ struct EncodingTraits { }; using ArrowType = ::arrow::LargeBinaryType; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::LargeBinaryType>; - - static constexpr auto memory_limit = ::arrow::kLargeBinaryMemoryLimit; }; template <> From 15be2a2ce77e16d047c845b3ec3585bebe6850e0 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 8 Jun 2023 15:22:14 -0300 Subject: [PATCH 38/69] consolidate setdict --- cpp/src/parquet/encoding.cc | 84 +++++++++++++++---------------------- 1 file changed, 34 insertions(+), 50 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 3a1c4da937a24..f4c35f424c0a9 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1524,6 +1524,38 @@ class DictDecoderImpl : public 
DecoderImpl, virtual public DictDecoder { // Perform type-specific initiatialization void SetDict(TypedDecoder* dictionary) override; + template + || std::is_same_v>> + void SetByteArrayDict(TypedDecoder* dictionary) + { + DecodeDict(dictionary); + + auto dict_values = reinterpret_cast(dictionary_->mutable_data()); + + int total_size = 0; + for (int i = 0; i < dictionary_length_; ++i) { + total_size += dict_values[i].len; + } + PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, + /*shrink_to_fit=*/false)); + PARQUET_THROW_NOT_OK( + byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t), + /*shrink_to_fit=*/false)); + + int32_t offset = 0; + uint8_t* bytes_data = byte_array_data_->mutable_data(); + int32_t* bytes_offsets = + reinterpret_cast(byte_array_offsets_->mutable_data()); + for (int i = 0; i < dictionary_length_; ++i) { + memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); + bytes_offsets[i] = offset; + dict_values[i].ptr = bytes_data + offset; + offset += dict_values[i].len; + } + bytes_offsets[dictionary_length_] = offset; + } + void SetData(int num_values, const uint8_t* data, int len) override { num_values_ = num_values; if (len == 0) { @@ -1690,60 +1722,12 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictionary template <> void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { - DecodeDict(dictionary); - - auto dict_values = reinterpret_cast(dictionary_->mutable_data()); - - int total_size = 0; - for (int i = 0; i < dictionary_length_; ++i) { - total_size += dict_values[i].len; - } - PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, - /*shrink_to_fit=*/false)); - PARQUET_THROW_NOT_OK( - byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t), - /*shrink_to_fit=*/false)); - - int32_t offset = 0; - uint8_t* bytes_data = byte_array_data_->mutable_data(); - int32_t* bytes_offsets = - reinterpret_cast(byte_array_offsets_->mutable_data()); - for (int i = 0; i < dictionary_length_; ++i) { - memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); - bytes_offsets[i] = offset; - dict_values[i].ptr = bytes_data + offset; - offset += dict_values[i].len; - } - bytes_offsets[dictionary_length_] = offset; + SetByteArrayDict(dictionary); } template <> void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { - DecodeDict(dictionary); - - auto dict_values = reinterpret_cast(dictionary_->mutable_data()); - - int total_size = 0; - for (int i = 0; i < dictionary_length_; ++i) { - total_size += dict_values[i].len; - } - PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, - /*shrink_to_fit=*/false)); - PARQUET_THROW_NOT_OK( - byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t), - /*shrink_to_fit=*/false)); - - int32_t offset = 0; - uint8_t* bytes_data = byte_array_data_->mutable_data(); - int32_t* bytes_offsets = - reinterpret_cast(byte_array_offsets_->mutable_data()); - for (int i = 0; i < dictionary_length_; ++i) { - memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); - bytes_offsets[i] = offset; - dict_values[i].ptr = bytes_data + offset; - offset += dict_values[i].len; - } - bytes_offsets[dictionary_length_] = offset; + SetByteArrayDict(dictionary); } template <> From 8d5ba3df29f2f909e74d92c180246b36e0691d90 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 8 Jun 2023 15:32:30 -0300 Subject: [PATCH 39/69] apply clangformat --- .../parquet/arrow/arrow_reader_writer_test.cc | 10 +- cpp/src/parquet/arrow/reader.cc | 7 +- cpp/src/parquet/arrow/schema_internal.cc | 6 +- 
cpp/src/parquet/column_reader.cc | 56 +-- cpp/src/parquet/encoding.cc | 397 +++++++++--------- cpp/src/parquet/encoding.h | 9 +- cpp/src/parquet/properties.h | 4 +- cpp/src/parquet/types.h | 3 +- 8 files changed, 245 insertions(+), 247 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index f92637ac4d406..7fa6d23414756 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3834,14 +3834,14 @@ TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) { ASSERT_EQ(expected, calculated); } -void TryReadDataFileWithProperties(const std::string& path, - const ArrowReaderProperties& properties, - ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) { +void TryReadDataFileWithProperties( + const std::string& path, const ArrowReaderProperties& properties, + ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) { auto pool = ::arrow::default_memory_pool(); std::unique_ptr arrow_reader; - Status s = - FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), properties, &arrow_reader); + Status s = FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), properties, + &arrow_reader); if (s.ok()) { std::shared_ptr<::arrow::Table> table; s = arrow_reader->ReadTable(&table); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index b163eaa4850c0..e8a3f79aa0e6d 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -463,11 +463,8 @@ class LeafReader : public ColumnReaderImpl { input_(std::move(input)), descr_(input_->descr()) { record_reader_ = RecordReader::Make( - descr_, - leaf_info, - ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, - /*read_dense_for_nullable*/ false, - ctx_->use_large_binary_variants); + descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, + /*read_dense_for_nullable*/ false, ctx_->use_large_binary_variants); NextRowGroup(); } diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index 1cf0ce34706ce..b399b1f83dbdd 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -214,10 +214,10 @@ Result> GetArrowType( Result> GetArrowType( const schema::PrimitiveNode& primitive, - const ::arrow::TimeUnit::type int96_arrow_time_unit, - bool use_large_binary_variants) { + const ::arrow::TimeUnit::type int96_arrow_time_unit, bool use_large_binary_variants) { return GetArrowType(primitive.physical_type(), *primitive.logical_type(), - primitive.type_length(), int96_arrow_time_unit, use_large_binary_variants); + primitive.type_length(), int96_arrow_time_unit, + use_large_binary_variants); } } // namespace arrow diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 87d8e33d19df8..6da925bc71ba0 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2095,8 +2095,8 @@ class FLBARecordReader : public TypedRecordReader, // TODO Below concept could be used to simplify type assertion, // but it requires c++20 -//template -//concept ByteArrayTypeConcept = std::is_same::value || +// template +// concept ByteArrayTypeConcept = std::is_same::value || // std::is_same::value; template @@ -2110,12 +2110,13 @@ struct IsByteArrayType : std::true_type {}; template struct ByteArrayBuilderTypeTrait { - using BuilderType = typename std::conditional::value, - ::arrow::LargeBinaryBuilder, - 
::arrow::BinaryBuilder>::type; + using BuilderType = + typename std::conditional::value, + ::arrow::LargeBinaryBuilder, + ::arrow::BinaryBuilder>::type; }; -template +template class ByteArrayChunkedRecordReaderImpl : public TypedRecordReader, virtual public BinaryRecordReader { public: @@ -2125,9 +2126,9 @@ class ByteArrayChunkedRecordReaderImpl : public TypedRecordReader, using BuilderType = typename ByteArrayBuilderTypeTrait::BuilderType; ByteArrayChunkedRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool, bool read_dense_for_nullable) - : TypedRecordReader(descr, leaf_info, pool, - read_dense_for_nullable) { + ::arrow::MemoryPool* pool, + bool read_dense_for_nullable) + : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable) { static_assert(IsByteArrayType::value, "Invalid ByteArrayType"); ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); accumulator_.builder = std::make_unique(pool); @@ -2165,7 +2166,8 @@ class ByteArrayChunkedRecordReaderImpl : public TypedRecordReader, }; using ByteArrayChunkedRecordReader = ByteArrayChunkedRecordReaderImpl; -using LargeByteArrayChunkedRecordReader = ByteArrayChunkedRecordReaderImpl; +using LargeByteArrayChunkedRecordReader = + ByteArrayChunkedRecordReaderImpl; template class ByteArrayDictionaryRecordReaderImpl : public TypedRecordReader, @@ -2176,7 +2178,8 @@ class ByteArrayDictionaryRecordReaderImpl : public TypedRecordReader, public: ByteArrayDictionaryRecordReaderImpl(const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool, bool read_dense_for_nullable) + ::arrow::MemoryPool* pool, + bool read_dense_for_nullable) : TypedRecordReader(descr, leaf_info, pool, read_dense_for_nullable), builder_(pool) { this->read_dictionary_ = true; @@ -2254,8 +2257,10 @@ class ByteArrayDictionaryRecordReaderImpl : public TypedRecordReader, std::vector> result_chunks_; }; -using ByteArrayDictionaryRecordReader = ByteArrayDictionaryRecordReaderImpl; -using LargeByteArrayDictionaryRecordReader = ByteArrayDictionaryRecordReaderImpl; +using ByteArrayDictionaryRecordReader = + ByteArrayDictionaryRecordReaderImpl; +using LargeByteArrayDictionaryRecordReader = + ByteArrayDictionaryRecordReaderImpl; // TODO(wesm): Implement these to some satisfaction template <> @@ -2284,17 +2289,15 @@ std::shared_ptr MakeByteArrayRecordReader(const ColumnDescriptor* } } -std::shared_ptr MakeLargeByteArrayRecordReader(const ColumnDescriptor* descr, - LevelInfo leaf_info, - ::arrow::MemoryPool* pool, - bool read_dictionary, - bool read_dense_for_nullable) { +std::shared_ptr MakeLargeByteArrayRecordReader( + const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool, + bool read_dictionary, bool read_dense_for_nullable) { if (read_dictionary) { - return std::make_shared(descr, leaf_info, pool, - read_dense_for_nullable); - } else { - return std::make_shared( + return std::make_shared( descr, leaf_info, pool, read_dense_for_nullable); + } else { + return std::make_shared(descr, leaf_info, pool, + read_dense_for_nullable); } } @@ -2325,10 +2328,11 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, return std::make_shared>(descr, leaf_info, pool, read_dense_for_nullable); case Type::BYTE_ARRAY: { - return use_binary_string_large_variants ? 
MakeLargeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, - read_dense_for_nullable) - : MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, - read_dense_for_nullable); + return use_binary_string_large_variants + ? MakeLargeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, + read_dense_for_nullable) + : MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, + read_dense_for_nullable); } case Type::FIXED_LEN_BYTE_ARRAY: return std::make_shared(descr, leaf_info, pool, diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index f4c35f424c0a9..b97950597e5f0 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1242,20 +1242,17 @@ template struct ArrowBinaryHelperTraits; template <> -struct ArrowBinaryHelperTraits -{ +struct ArrowBinaryHelperTraits { static constexpr auto memory_limit = ::arrow::kBinaryMemoryLimit; }; template <> -struct ArrowBinaryHelperTraits -{ +struct ArrowBinaryHelperTraits { static constexpr auto memory_limit = ::arrow::kLargeBinaryMemoryLimit; }; template struct ArrowBinaryHelperBase { - explicit ArrowBinaryHelperBase(typename EncodingTraits::Accumulator* out) { this->out = out; this->builder = out->builder.get(); @@ -1375,10 +1372,10 @@ class PlainByteArrayDecoderBase : public PlainDecoder, virtual public TypedDecoder { public: using Base = PlainDecoder; - using Base::len_; using Base::data_; - using Base::num_values_; using Base::DecodeSpaced; + using Base::len_; + using Base::num_values_; using Base::PlainDecoder; // ---------------------------------------------------------------------- @@ -1525,10 +1522,9 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { void SetDict(TypedDecoder* dictionary) override; template - || std::is_same_v>> - void SetByteArrayDict(TypedDecoder* dictionary) - { + typename = std::enable_if_t || + std::is_same_v>> + void SetByteArrayDict(TypedDecoder* dictionary) { DecodeDict(dictionary); auto dict_values = reinterpret_cast(dictionary_->mutable_data()); @@ -1726,7 +1722,8 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictio } template <> -void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { +void DictDecoderImpl::SetDict( + TypedDecoder* dictionary) { SetByteArrayDict(dictionary); } @@ -1922,7 +1919,8 @@ void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* bui } template <> -void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { +void DictDecoderImpl::InsertDictionary( + ::arrow::ArrayBuilder* builder) { auto binary_builder = checked_cast<::arrow::LargeBinaryDictionary32Builder*>(builder); // Make a LargeBinaryArray referencing the internal dictionary data @@ -1934,220 +1932,219 @@ void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder template class DictByteArrayDecoderImpl : public DictDecoderImpl, virtual public TypedDecoder { + public: + using BASE = DictDecoderImpl; + using BASE::DictDecoderImpl; + using BASE::dictionary_; + using BASE::idx_decoder_; + using BASE::IndexInBounds; - public: - using BASE = DictDecoderImpl; - using BASE::DictDecoderImpl; - using BASE::dictionary_; - using BASE::idx_decoder_; - using BASE::IndexInBounds; - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::DictAccumulator* builder) override { - int result = 0; - if (null_count == 0) { - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); - } else { - PARQUET_THROW_NOT_OK(DecodeArrow(num_values, 
null_count, valid_bits, - valid_bits_offset, builder, &result)); - } - return result; + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) override { + int result = 0; + if (null_count == 0) { + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); + } else { + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); } + return result; + } - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out) override { - int result = 0; - if (null_count == 0) { - PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result)); - } else { - PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, - valid_bits_offset, out, &result)); + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out) override { + int result = 0; + if (null_count == 0) { + PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result)); + } else { + PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, + valid_bits_offset, out, &result)); + } + return result; + } + + private: + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + constexpr int32_t kBufferSize = 1024; + int32_t indices[kBufferSize]; + + ArrowBinaryHelperBase helper(out); + + auto dict_values = reinterpret_cast(dictionary_->data()); + int values_decoded = 0; + int num_indices = 0; + int pos_indices = 0; + + auto visit_valid = [&](int64_t position) -> Status { + if (num_indices == pos_indices) { + // Refill indices buffer + const auto batch_size = + std::min(kBufferSize, num_values - null_count - values_decoded); + num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (ARROW_PREDICT_FALSE(num_indices < 1)) { + return Status::Invalid("Invalid number of indices: ", num_indices); + } + pos_indices = 0; } - return result; - } - - private: - Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out, - int* out_num_values) { - constexpr int32_t kBufferSize = 1024; - int32_t indices[kBufferSize]; - - ArrowBinaryHelperBase helper(out); - - auto dict_values = reinterpret_cast(dictionary_->data()); - int values_decoded = 0; - int num_indices = 0; - int pos_indices = 0; - - auto visit_valid = [&](int64_t position) -> Status { - if (num_indices == pos_indices) { - // Refill indices buffer - const auto batch_size = - std::min(kBufferSize, num_values - null_count - values_decoded); - num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (ARROW_PREDICT_FALSE(num_indices < 1)) { - return Status::Invalid("Invalid number of indices: ", num_indices); - } - pos_indices = 0; + const auto index = indices[pos_indices++]; + RETURN_NOT_OK(IndexInBounds(index)); + const auto& val = dict_values[index]; + if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { + RETURN_NOT_OK(helper.PushChunk()); + } + RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); + ++values_decoded; + return Status::OK(); + }; + + auto visit_null = [&]() -> Status { + RETURN_NOT_OK(helper.AppendNull()); + return Status::OK(); + }; + + 
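(The reindented block that follows scans the validity bitmap one 64-bit word at a time, so runs of all-valid or all-null values skip per-bit branching. In plain form, the BitBlockCounter idiom used below reads as this sketch, with visit_valid/visit_null standing in for the lambdas defined above:)

    ::arrow::internal::BitBlockCounter blocks(valid_bits, valid_bits_offset, num_values);
    int64_t position = 0;
    while (position < num_values) {
      const auto block = blocks.NextWord();  // classifies up to 64 validity bits at once
      if (block.AllSet()) {                  // fast path: no null checks needed
        for (int64_t i = 0; i < block.length; ++i, ++position) {
          ARROW_RETURN_NOT_OK(visit_valid(position));
        }
      } else if (block.NoneSet()) {          // fast path: every slot is null
        for (int64_t i = 0; i < block.length; ++i, ++position) {
          ARROW_RETURN_NOT_OK(visit_null());
        }
      } else {                               // mixed word: test each bit individually
        for (int64_t i = 0; i < block.length; ++i, ++position) {
          if (bit_util::GetBit(valid_bits, valid_bits_offset + position)) {
            ARROW_RETURN_NOT_OK(visit_valid(position));
          } else {
            ARROW_RETURN_NOT_OK(visit_null());
          }
        }
      }
    }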
::arrow::internal::BitBlockCounter bit_blocks(valid_bits, valid_bits_offset, + num_values); + int64_t position = 0; + while (position < num_values) { + const auto block = bit_blocks.NextWord(); + if (block.AllSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + ARROW_RETURN_NOT_OK(visit_valid(position)); } - const auto index = indices[pos_indices++]; - RETURN_NOT_OK(IndexInBounds(index)); - const auto& val = dict_values[index]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); + } else if (block.NoneSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + ARROW_RETURN_NOT_OK(visit_null()); } - RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); - ++values_decoded; - return Status::OK(); - }; - - auto visit_null = [&]() -> Status { - RETURN_NOT_OK(helper.AppendNull()); - return Status::OK(); - }; - - ::arrow::internal::BitBlockCounter bit_blocks(valid_bits, valid_bits_offset, - num_values); - int64_t position = 0; - while (position < num_values) { - const auto block = bit_blocks.NextWord(); - if (block.AllSet()) { - for (int64_t i = 0; i < block.length; ++i, ++position) { + } else { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (bit_util::GetBit(valid_bits, valid_bits_offset + position)) { ARROW_RETURN_NOT_OK(visit_valid(position)); - } - } else if (block.NoneSet()) { - for (int64_t i = 0; i < block.length; ++i, ++position) { + } else { ARROW_RETURN_NOT_OK(visit_null()); } - } else { - for (int64_t i = 0; i < block.length; ++i, ++position) { - if (bit_util::GetBit(valid_bits, valid_bits_offset + position)) { - ARROW_RETURN_NOT_OK(visit_valid(position)); - } else { - ARROW_RETURN_NOT_OK(visit_null()); - } - } } } - - *out_num_values = values_decoded; - return Status::OK(); } - Status DecodeArrowDenseNonNull(int num_values, - typename EncodingTraits::Accumulator* out, - int* out_num_values) { - constexpr int32_t kBufferSize = 2048; - int32_t indices[kBufferSize]; - int values_decoded = 0; + *out_num_values = values_decoded; + return Status::OK(); + } - ArrowBinaryHelperBase helper(out); - auto dict_values = reinterpret_cast(dictionary_->data()); + Status DecodeArrowDenseNonNull(int num_values, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + constexpr int32_t kBufferSize = 2048; + int32_t indices[kBufferSize]; + int values_decoded = 0; - while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (num_indices == 0) ParquetException::EofException(); - for (int i = 0; i < num_indices; ++i) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); - } - RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); + ArrowBinaryHelperBase helper(out); + auto dict_values = reinterpret_cast(dictionary_->data()); + + while (values_decoded < num_values) { + int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (num_indices == 0) ParquetException::EofException(); + for (int i = 0; i < num_indices; ++i) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { + RETURN_NOT_OK(helper.PushChunk()); } - values_decoded += num_indices; + 
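(The CanFit/PushChunk pair above is what keeps each accumulated BinaryArray below the 32-bit offset ceiling: when the next value would not fit, the current builder is finished into a chunk and a fresh one is started. Roughly, and only as a sketch of the helper's behavior, with chunk_space_remaining_ as its internal byte budget:)

    bool CanFit(int64_t length) const { return length <= chunk_space_remaining_; }

    ::arrow::Status PushChunk() {
      std::shared_ptr<::arrow::Array> chunk;
      RETURN_NOT_OK(builder->Finish(&chunk));   // seal the chunk built so far
      out->chunks.push_back(std::move(chunk));  // hand it to the accumulator
      chunk_space_remaining_ = memory_limit;    // and reset the byte budget
      return ::arrow::Status::OK();
    }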
RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); } - *out_num_values = values_decoded; - return Status::OK(); + values_decoded += num_indices; } + *out_num_values = values_decoded; + return Status::OK(); + } - template - Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* out_num_values) { - constexpr int32_t kBufferSize = 1024; - int32_t indices[kBufferSize]; - - RETURN_NOT_OK(builder->Reserve(num_values)); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); - - auto dict_values = reinterpret_cast(dictionary_->data()); - - int values_decoded = 0; - int num_appended = 0; - while (num_appended < num_values) { - bool is_valid = bit_reader.IsSet(); - bit_reader.Next(); - - if (is_valid) { - int32_t batch_size = - std::min(kBufferSize, num_values - num_appended - null_count); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - - int i = 0; - while (true) { - // Consume all indices - if (is_valid) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - RETURN_NOT_OK(builder->Append(val.ptr, val.len)); - ++i; - ++values_decoded; - } else { - RETURN_NOT_OK(builder->AppendNull()); - --null_count; - } - ++num_appended; - if (i == num_indices) { - // Do not advance the bit_reader if we have fulfilled the decode - // request - break; - } - is_valid = bit_reader.IsSet(); - bit_reader.Next(); + template + Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_num_values) { + constexpr int32_t kBufferSize = 1024; + int32_t indices[kBufferSize]; + + RETURN_NOT_OK(builder->Reserve(num_values)); + ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + + auto dict_values = reinterpret_cast(dictionary_->data()); + + int values_decoded = 0; + int num_appended = 0; + while (num_appended < num_values) { + bool is_valid = bit_reader.IsSet(); + bit_reader.Next(); + + if (is_valid) { + int32_t batch_size = + std::min(kBufferSize, num_values - num_appended - null_count); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + + int i = 0; + while (true) { + // Consume all indices + if (is_valid) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + RETURN_NOT_OK(builder->Append(val.ptr, val.len)); + ++i; + ++values_decoded; + } else { + RETURN_NOT_OK(builder->AppendNull()); + --null_count; } - } else { - RETURN_NOT_OK(builder->AppendNull()); - --null_count; ++num_appended; + if (i == num_indices) { + // Do not advance the bit_reader if we have fulfilled the decode + // request + break; + } + is_valid = bit_reader.IsSet(); + bit_reader.Next(); } + } else { + RETURN_NOT_OK(builder->AppendNull()); + --null_count; + ++num_appended; } - *out_num_values = values_decoded; - return Status::OK(); } + *out_num_values = values_decoded; + return Status::OK(); + } - template - Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) { - constexpr int32_t kBufferSize = 2048; - int32_t indices[kBufferSize]; + template + Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) { + constexpr int32_t kBufferSize = 2048; + int32_t indices[kBufferSize]; - RETURN_NOT_OK(builder->Reserve(num_values)); + RETURN_NOT_OK(builder->Reserve(num_values)); - auto dict_values = reinterpret_cast(dictionary_->data()); + auto 
dict_values = reinterpret_cast(dictionary_->data()); - int values_decoded = 0; - while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); - if (num_indices == 0) ParquetException::EofException(); - for (int i = 0; i < num_indices; ++i) { - auto idx = indices[i]; - RETURN_NOT_OK(IndexInBounds(idx)); - const auto& val = dict_values[idx]; - RETURN_NOT_OK(builder->Append(val.ptr, val.len)); - } - values_decoded += num_indices; + int values_decoded = 0; + while (values_decoded < num_values) { + int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); + int num_indices = idx_decoder_.GetBatch(indices, batch_size); + if (num_indices == 0) ParquetException::EofException(); + for (int i = 0; i < num_indices; ++i) { + auto idx = indices[i]; + RETURN_NOT_OK(IndexInBounds(idx)); + const auto& val = dict_values[idx]; + RETURN_NOT_OK(builder->Append(val.ptr, val.len)); } - *out_num_values = values_decoded; - return Status::OK(); + values_decoded += num_indices; } + *out_num_values = values_decoded; + return Status::OK(); + } }; using DictLargeByteArrayDecoderImpl = DictByteArrayDecoderImpl; @@ -3506,7 +3503,8 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool, bool use_large_binary_variants) { + ::arrow::MemoryPool* pool, + bool use_large_binary_variants) { if (encoding == Encoding::PLAIN) { switch (type_num) { case Type::BOOLEAN: @@ -3575,8 +3573,7 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin namespace detail { std::unique_ptr MakeDictDecoder(Type::type type_num, - const ColumnDescriptor* descr, - MemoryPool* pool, + const ColumnDescriptor* descr, MemoryPool* pool, bool use_large_binary_variants) { switch (type_num) { case Type::BOOLEAN: diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index af5425fdc54b3..f61c5e5b642d2 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -456,7 +456,8 @@ std::unique_ptr::Encoder> MakeTypedEncoder( PARQUET_EXPORT std::unique_ptr MakeDecoder( Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool use_large_binary_variants = false); + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + bool use_large_binary_variants = false); namespace detail { @@ -473,7 +474,8 @@ std::unique_ptr> MakeDictDecoder( const ColumnDescriptor* descr = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { using OutType = DictDecoder; - auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool, std::is_same_v); + auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool, + std::is_same_v); return std::unique_ptr(dynamic_cast(decoder.release())); } @@ -483,7 +485,8 @@ std::unique_ptr::Decoder> MakeTypedDecoder( ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { using OutType = typename EncodingTraits::Decoder; - std::unique_ptr base = MakeDecoder(DType::type_num, encoding, descr, pool, std::is_same_v); + std::unique_ptr base = MakeDecoder(DType::type_num, encoding, descr, pool, + std::is_same_v); return std::unique_ptr(dynamic_cast(base.release())); } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 4e55b50375d7e..e59b2e4e84254 100644 --- 
a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -856,9 +856,7 @@ class PARQUET_EXPORT ArrowReaderProperties { use_large_binary_variants_ = use_large_binary_variants; } - bool use_large_binary_variants() const { - return use_large_binary_variants_; - } + bool use_large_binary_variants() const { return use_large_binary_variants_; } private: bool use_threads_; diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index f24cad9e87bcc..73a25cbf1e131 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -767,8 +767,7 @@ using ByteArrayType = PhysicalType; * will not exceed 2^31 - 1. However, arrow supports StringType/BinaryType and their * large variants (i.e. LargeStringType and LargeBinaryType). * */ -struct LargeByteArrayType : public ByteArrayType -{}; +struct LargeByteArrayType : public ByteArrayType {}; using FLBAType = PhysicalType; From fd8f979bb006e2e47da03d92062031090a892903 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 9 Jun 2023 09:19:30 -0300 Subject: [PATCH 40/69] removed todos --- cpp/src/parquet/page_index.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index f3bca027dac5b..d29cc33eb5afd 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -853,7 +853,6 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, return std::make_unique>(descr, column_index); case Type::BYTE_ARRAY: return std::make_unique>(descr, column_index); - // TODO AP FIX ARTHUR PASSOS case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, column_index); case Type::UNDEFINED: @@ -898,7 +897,6 @@ std::unique_ptr ColumnIndexBuilder::Make( return std::make_unique>(descr); case Type::BYTE_ARRAY: return std::make_unique>(descr); - // TODO AP FIX ARTHUR PASSOS case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr); case Type::UNDEFINED: From a5736d5ca39791db97b44f966fc1ff9661ea2647 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 9 Jun 2023 09:20:33 -0300 Subject: [PATCH 41/69] a bit more renaming --- cpp/src/parquet/column_reader.cc | 4 ++-- cpp/src/parquet/column_reader.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 6da925bc71ba0..0bc7329dbd456 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2307,7 +2307,7 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, LevelInfo leaf_info, MemoryPool* pool, bool read_dictionary, bool read_dense_for_nullable, - bool use_binary_string_large_variants) { + bool use_large_binary_variants) { switch (descr->physical_type()) { case Type::BOOLEAN: return std::make_shared>(descr, leaf_info, pool, @@ -2328,7 +2328,7 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, return std::make_shared>(descr, leaf_info, pool, read_dense_for_nullable); case Type::BYTE_ARRAY: { - return use_binary_string_large_variants + return use_large_binary_variants ? 
MakeLargeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, read_dense_for_nullable) : MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary, diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 471117c1f13e6..7e938310a9839 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -322,7 +322,7 @@ class PARQUET_EXPORT RecordReader { const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), bool read_dictionary = false, bool read_dense_for_nullable = false, - bool use_binary_string_large_variants = false); + bool use_large_binary_variants = false); virtual ~RecordReader() = default; From b4ecd0d601cd3a1ef09fb5e6ccad8b07208665a8 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 9 Jun 2023 09:45:31 -0300 Subject: [PATCH 42/69] address one mor comment --- cpp/src/parquet/arrow/reader.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index e8a3f79aa0e6d..c74a93f419e5c 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1220,6 +1220,7 @@ Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_facto ctx->pool = pool_; ctx->iterator_factory = iterator_factory; ctx->filter_leaves = false; + ctx->use_large_binary_variants = reader_properties_.use_large_binary_variants(); std::unique_ptr result; RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result)); *out = std::move(result); From 9e9dff9b416a6b03c687b161cea5e7711a9382a1 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 9 Jun 2023 13:19:20 -0300 Subject: [PATCH 43/69] add overflow check in dict --- cpp/src/parquet/encoding.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b97950597e5f0..a5b99c256829f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1531,7 +1531,9 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { int total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { - total_size += dict_values[i].len; + if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) { + throw ParquetException("String/Binary Length to large"); + } } PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); From ae1db20cda7c5515d2ab21782838102f162465e0 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 12 Jun 2023 08:47:54 -0300 Subject: [PATCH 44/69] address a few comments --- cpp/src/parquet/encoding.cc | 2 +- cpp/src/parquet/types.h | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index a5b99c256829f..9b0ccf14d255d 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1532,7 +1532,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { int total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) { - throw ParquetException("String/Binary Length to large"); + throw ParquetException("String/Binary length to large"); } } PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 73a25cbf1e131..11eb0e703b7a7 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -763,9 +763,13 @@ using DoubleType = PhysicalType; using 
ByteArrayType = PhysicalType; /* - * Parquet uses ByteArrayType for variable length strings and binaries and their lengths - * will not exceed 2^31 - 1. However, arrow supports StringType/BinaryType and their - * large variants (i.e. LargeStringType and LargeBinaryType). + * Parquet has defined ByteArrayType for variable length string and binary values with a + * maximum length of 2^31 - 1. By default, arrow StringType and BinaryType are used to + * map parquet ByteArrayType. However, arrow StringArray/BinaryArray uses int32_t to + * store the offset of each string/binary value in a concatenated buffer which may + * overflow (though unlikely in most cases). As arrow has defined LargeStringType and + * LargeBinaryType which use int64_t as the offset type, we define LargeByteArrayType + * below to indicate parquet reader/writer to use those large variants from arrow. * */ struct LargeByteArrayType : public ByteArrayType {}; From 09a9eaf3c9d946b3fcfb0ce97947af452396437e Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 14 Jun 2023 09:21:37 -0300 Subject: [PATCH 45/69] use int32_t explicitly --- cpp/src/parquet/encoding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 9b0ccf14d255d..20fffd2fa4182 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1529,7 +1529,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { auto dict_values = reinterpret_cast(dictionary_->mutable_data()); - int total_size = 0; + int32_t total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) { throw ParquetException("String/Binary length to large"); From 1664983da76efa09ed8983ce0a6ab3e34b00f15f Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 14 Jun 2023 14:05:00 -0300 Subject: [PATCH 46/69] use template directly --- cpp/src/parquet/encoding.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 20fffd2fa4182..bf6e00763f699 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -2149,8 +2149,6 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, } }; -using DictLargeByteArrayDecoderImpl = DictByteArrayDecoderImpl; - // ---------------------------------------------------------------------- // DeltaBitPackEncoder @@ -3592,9 +3590,9 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, return std::make_unique>(descr, pool); case Type::BYTE_ARRAY: if (use_large_binary_variants) { - return std::make_unique(descr, pool); + return std::make_unique>(descr, pool); } else { - return std::make_unique>(descr, pool); + return std::make_unique>(descr, pool); } case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, pool); From 322319edea2828410c139c808362f78ea686b04e Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 15 Jun 2023 14:14:15 -0300 Subject: [PATCH 47/69] use offset_type --- cpp/src/parquet/encoding.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index bf6e00763f699..4d8867e4eccf8 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1529,6 +1529,8 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { auto dict_values = reinterpret_cast(dictionary_->mutable_data()); + using offset_type = typename EncodingTraits::ArrowType::offset_type; + int32_t 
total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) { @@ -1538,13 +1540,13 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); PARQUET_THROW_NOT_OK( - byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t), + byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(offset_type), /*shrink_to_fit=*/false)); - int32_t offset = 0; + offset_type offset = 0; uint8_t* bytes_data = byte_array_data_->mutable_data(); - int32_t* bytes_offsets = - reinterpret_cast(byte_array_offsets_->mutable_data()); + auto* bytes_offsets = + reinterpret_cast(byte_array_offsets_->mutable_data()); for (int i = 0; i < dictionary_length_; ++i) { memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); bytes_offsets[i] = offset; From 1775a7a02923aec51bc77cab948da1f598e17324 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 15 Jun 2023 14:50:54 -0300 Subject: [PATCH 48/69] address comments --- cpp/src/parquet/arrow/reader_internal.cc | 5 +++-- cpp/src/parquet/encoding.cc | 6 ++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index a294b712a7ce3..2dbe923a9d43c 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -487,8 +487,9 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, auto chunks = binary_reader->GetBuilderChunks(); for (auto& chunk : chunks) { if (!chunk->type()->Equals(*logical_type_field->type())) { - // XXX: if a LargeBinary chunk is larger than 2GB, the MSBs of offsets - // will be lost because they are first created as int32 and then cast to int64. + // XXX: if a LargeBinary chunk is larger than 2GB and use_large_binary_variants + // is not set, the MSBs of offsets will be lost because they are first created + // as int32 and then cast to int64. 
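(To make the failure mode concrete, a sketch with made-up numbers, not taken from the patch:)

    int32_t offset = 2147483632;   // INT32_MAX - 15, near the 2^31 - 1 ceiling
    // offset += 32;               // would overflow int32: the true position 2147483664
    //                             // is unrepresentable, so the stored offset is garbage
    int64_t widened = offset;      // a later int32 -> int64 cast cannot recover it
    // Hence use_large_binary_variants, which keeps offsets in int64 from the start.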
ARROW_ASSIGN_OR_RAISE( chunk, ::arrow::compute::Cast(*chunk, logical_type_field->type(), cast_options, &ctx)); diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 4d8867e4eccf8..3aaedce7bc51e 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1531,11 +1531,9 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { using offset_type = typename EncodingTraits::ArrowType::offset_type; - int32_t total_size = 0; + offset_type total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { - if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) { - throw ParquetException("String/Binary length to large"); - } + total_size += dict_values[i].len; } PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); From 7f6e2bf58c69cefb9d752cb83d9c8f391ad1869c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 16 Jun 2023 08:12:46 -0300 Subject: [PATCH 49/69] address a few minor comments --- cpp/src/parquet/arrow/reader_internal.cc | 2 +- cpp/src/parquet/column_reader.cc | 5 ++--- cpp/src/parquet/encoding.cc | 4 +++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 2dbe923a9d43c..a1c40df747706 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -487,7 +487,7 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, auto chunks = binary_reader->GetBuilderChunks(); for (auto& chunk : chunks) { if (!chunk->type()->Equals(*logical_type_field->type())) { - // XXX: if a LargeBinary chunk is larger than 2GB and use_large_binary_variants + // If a LargeBinary chunk is larger than 2GB and use_large_binary_variants // is not set, the MSBs of offsets will be lost because they are first created // as int32 and then cast to int64. ARROW_ASSIGN_OR_RAISE( diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 0bc7329dbd456..cf2511db04530 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2093,11 +2093,10 @@ class FLBARecordReader : public TypedRecordReader, std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_; }; -// TODO Below concept could be used to simplify type assertion, -// but it requires c++20 +// TODO: Below concept could be used to simplify type assertion in C++20. 
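(Spelled out, the C++20 alternative the TODO above refers to would look roughly like this sketch; the commented lines below are the patch's own shorthand for the same idea:)

    template <typename T>
    concept ByteArrayTypeConcept =
        std::is_same<T, ByteArrayType>::value || std::is_same<T, LargeByteArrayType>::value;

    // which would allow declarations such as:
    //   template <ByteArrayTypeConcept T> class ByteArrayChunkedRecordReaderImpl;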
// template // concept ByteArrayTypeConcept = std::is_same::value || -// std::is_same::value; +// std::is_same::value; template struct IsByteArrayType : std::false_type {}; diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 3aaedce7bc51e..b81fc68be8d55 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1533,7 +1533,9 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { offset_type total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { - total_size += dict_values[i].len; + if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) { + throw ParquetException("String/Binary length to large"); + } } PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); From 75fb61559fb269ee7816fa4610fe98d88cbcc18e Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 16 Jun 2023 09:54:53 -0300 Subject: [PATCH 50/69] fix DictDecoderImpl --- .../parquet/arrow/arrow_reader_writer_test.cc | 3 ++ cpp/src/parquet/arrow/schema.cc | 3 +- cpp/src/parquet/encoding.cc | 35 ++++++++++++++----- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 7fa6d23414756..b61a0c0affe21 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3873,6 +3873,9 @@ TEST(TestArrowParquet, LargeByteArray) { TryReadDataFile(path, ::arrow::StatusCode::NotImplemented); ArrowReaderProperties reader_properties; reader_properties.set_use_large_binary_variants(true); + reader_properties.set_read_dictionary(0, false); + TryReadDataFileWithProperties(path, reader_properties); + reader_properties.set_read_dictionary(0, true); TryReadDataFileWithProperties(path, reader_properties); } diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 799c9a244ff43..445bc017f5b30 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -462,7 +462,8 @@ struct SchemaTreeContext { bool IsDictionaryReadSupported(const ArrowType& type) { // Only supported currently for BYTE_ARRAY types - return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING; + return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING + || type.id() == ::arrow::Type::LARGE_BINARY || type.id() == ::arrow::Type::LARGE_STRING; } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b81fc68be8d55..827309646b7f3 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1631,11 +1631,19 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { valid_bits, valid_bits_offset, num_values, null_count, [&]() { valid_bytes[i++] = 1; }, [&]() { ++i; }); - auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder); - PARQUET_THROW_NOT_OK( - binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data())); - num_values_ -= num_values - null_count; - return num_values - null_count; + // It looks like this method is only called by ByteArray types. Previously, + // there was an unconditional cast to ::arrow::Dictionary32Builder<::arrow::BinaryType>. + // This won't work for LargeByteArrayType and the Type template argument can't be used + // unconditionally because it is not defined for several other types. 
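(With its template arguments written out, the guarded dispatch that follows reads roughly as below; this is a sketch assuming EncodingTraits<T>::DictAccumulator resolves to ::arrow::BinaryDictionary32Builder for ByteArrayType and to the Large variant for LargeByteArrayType:)

    if constexpr (std::is_same_v<Type, ByteArrayType> ||
                  std::is_same_v<Type, LargeByteArrayType>) {
      auto binary_builder =
          checked_cast<typename EncodingTraits<Type>::DictAccumulator*>(builder);
      PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values,
                                                         valid_bytes.data()));
      num_values_ -= num_values - null_count;
      return num_values - null_count;
    }
    ParquetException::NYI("DecodeIndicesSpaced not implemented for this type");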
+ if constexpr (std::is_same_v || std::is_same_v) { + auto binary_builder = checked_cast::DictAccumulator*>(builder); + PARQUET_THROW_NOT_OK( + binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data())); + num_values_ -= num_values - null_count; + return num_values - null_count; + } + + ParquetException::NYI("DecodeIndicesSpaced not implemented for this type"); } int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) override { @@ -1652,10 +1660,19 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) { ParquetException::EofException(); } - auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder); - PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values)); - num_values_ -= num_values; - return num_values; + + // It looks like this method is only called by ByteArray types. Previously, + // there was an unconditional cast to ::arrow::Dictionary32Builder<::arrow::BinaryType>. + // This won't work for LargeByteArrayType and the Type template argument can't be used + // unconditionally because it is not defined for several other types. + if constexpr (std::is_same_v || std::is_same_v) { + auto binary_builder = checked_cast::DictAccumulator*>(builder); + PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values)); + num_values_ -= num_values; + return num_values; + } + + ParquetException::NYI("DecodeIndices not implemented for this type"); } int DecodeIndices(int num_values, int32_t* indices) override { From 0801267dbea9001e53908ebb40c82a263582c458 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 16 Jun 2023 14:03:33 -0300 Subject: [PATCH 51/69] add non overflow test --- .../parquet/arrow/arrow_reader_writer_test.cc | 36 ++++++++++++++++--- cpp/src/parquet/arrow/reader.cc | 8 ++++- cpp/src/parquet/arrow/reader.h | 6 ++++ 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index b61a0c0affe21..954045b74d1ad 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -439,7 +439,9 @@ void DoSimpleRoundtrip(const std::shared_ptr& table, bool use_threads, int64_t row_group_size, const std::vector& column_subset, std::shared_ptr
* out,
                       const std::shared_ptr<ArrowWriterProperties>& arrow_properties =
-                          default_arrow_writer_properties()) {
+                          default_arrow_writer_properties(),
+                      const ArrowReaderProperties& arrow_reader_properties =
+                          default_arrow_reader_properties()) {
   std::shared_ptr<Buffer> buffer;
   ASSERT_NO_FATAL_FAILURE(
       WriteTableToBuffer(table, row_group_size, arrow_properties, &buffer));
@@ -491,11 +493,14 @@ void DoRoundTripWithBatches(
 void CheckSimpleRoundtrip(
     const std::shared_ptr<Table>
& table, int64_t row_group_size,
     const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
-        default_arrow_writer_properties()) {
+        default_arrow_writer_properties(),
+    const ArrowReaderProperties& arrow_reader_properties =
+        default_arrow_reader_properties()) {
   std::shared_ptr<Table>
result;
   ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, false /* use_threads */,
                                             row_group_size, {}, &result,
-                                            arrow_writer_properties));
+                                            arrow_writer_properties,
+                                            arrow_reader_properties));
   ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
                              /*check_metadata=*/false);
   ASSERT_OK(result->ValidateFull());
@@ -610,9 +615,14 @@ class ParquetIOTestBase : public ::testing::Test {
   }
   void ReaderFromSink(std::unique_ptr<FileReader>* out) {
+    return ReaderFromSink(out, default_arrow_reader_properties());
+  }
+
+  void ReaderFromSink(std::unique_ptr<FileReader>* out,
+                      const ArrowReaderProperties& arrow_reader_properties) {
     ASSERT_OK_AND_ASSIGN(auto buffer, sink_->Finish());
     ASSERT_OK_NO_THROW(OpenFile(std::make_shared<BufferReader>(buffer),
-                                ::arrow::default_memory_pool(), out));
+                                ::arrow::default_memory_pool(), arrow_reader_properties, out));
   }
   void ReadSingleColumnFile(std::unique_ptr<FileReader> file_reader,
@@ -661,16 +671,18 @@ class ParquetIOTestBase : public ::testing::Test {
   void RoundTripSingleColumn(
       const std::shared_ptr<Array>& values, const std::shared_ptr<Array>& expected,
       const std::shared_ptr<::parquet::ArrowWriterProperties>& arrow_properties,
+      const ArrowReaderProperties& arrow_reader_properties = default_arrow_reader_properties(),
       bool nullable = true) {
    std::shared_ptr<Table>
table = MakeSimpleTable(values, nullable);
     this->ResetSink();
+
     ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
                                   values->length(), default_writer_properties(),
                                   arrow_properties));

     std::shared_ptr<Table>
out;
     std::unique_ptr<FileReader> reader;
-    ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader));
+    ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader, arrow_reader_properties));
     const bool expect_metadata = arrow_properties->store_schema();
     ASSERT_NO_FATAL_FAILURE(
         this->ReadTableFromFile(std::move(reader), expect_metadata, &out));
     ASSERT_EQ(1, out->num_columns());
@@ -709,6 +721,12 @@ class ParquetIOTestBase : public ::testing::Test {
     CheckSimpleRoundtrip(table, table->num_rows());
   }
+  void CheckRoundTrip(const std::shared_ptr<Table>
& table, + const std::shared_ptr& arrow_writer_properties, + const ArrowReaderProperties& arrow_reader_properties) { + CheckSimpleRoundtrip(table, table->num_rows(), arrow_writer_properties, arrow_reader_properties); + } + template void WriteColumn(const std::shared_ptr& schema, const std::shared_ptr& values) { @@ -1388,6 +1406,14 @@ TEST_F(TestLargeBinaryParquetIO, Basics) { const auto arrow_properties = ::parquet::ArrowWriterProperties::Builder().store_schema()->build(); this->RoundTripSingleColumn(large_array, large_array, arrow_properties); + + ArrowReaderProperties arrow_reader_properties; + arrow_reader_properties.set_use_large_binary_variants(true); + // Input is narrow array, but expected output is large array, opposite of the above tests. + // This validates narrow arrays can be read as large arrays. + this->RoundTripSingleColumn(narrow_array, large_array, + default_arrow_writer_properties(), + arrow_reader_properties); } using TestLargeStringParquetIO = TestParquetIO<::arrow::LargeStringType>; diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index c74a93f419e5c..87ec4cc9141f6 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1374,9 +1374,15 @@ Result> FileReaderBuilder::Build() { Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, std::unique_ptr* reader) { + return OpenFile(std::move(file), pool, default_arrow_reader_properties(), reader); +} + +Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, + const ArrowReaderProperties& arrow_reader_properties, + std::unique_ptr* reader) { FileReaderBuilder builder; RETURN_NOT_OK(builder.Open(std::move(file))); - return builder.memory_pool(pool)->Build(reader); + return builder.properties(arrow_reader_properties)->memory_pool(pool)->Build(reader); } namespace internal { diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 2cbd36176f5e3..0cfa6eb464927 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -369,6 +369,12 @@ ::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* allocator, std::unique_ptr* reader); +PARQUET_EXPORT +::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>, + ::arrow::MemoryPool* allocator, + const ArrowReaderProperties& arrow_reader_properties, + std::unique_ptr* reader); + /// @} PARQUET_EXPORT From 7f09a160730425d933a433104f09782a50633615 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 19 Jun 2023 13:19:16 -0300 Subject: [PATCH 52/69] string test --- .../parquet/arrow/arrow_reader_writer_test.cc | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 954045b74d1ad..ea709276a2a49 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -1360,6 +1360,31 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) { using TestStringParquetIO = TestParquetIO<::arrow::StringType>; +TEST_F(TestStringParquetIO, Basics) { + std::shared_ptr values; + + ::arrow::StringBuilder builder; + for (size_t i = 0; i < SMALL_SIZE; i++) { + ASSERT_OK(builder.Append("abc")); + } + ASSERT_OK(builder.Finish(&values)); + + // Input is narrow array, but expected output is large array, opposite of the above tests. + // This validates narrow arrays can be read as large arrays. 
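(For reference, the reader-side opt-in these tests exercise looks roughly like the following from user code; the builder chain matches the one used later in this series, while input_file is a placeholder for any arrow::io::RandomAccessFile:)

    parquet::ArrowReaderProperties props;
    props.set_use_large_binary_variants(true);

    parquet::arrow::FileReaderBuilder builder;
    PARQUET_THROW_NOT_OK(builder.Open(input_file));

    std::unique_ptr<parquet::arrow::FileReader> reader;
    PARQUET_THROW_NOT_OK(builder.properties(props)
                             ->memory_pool(::arrow::default_memory_pool())
                             ->Build(&reader));
    // BYTE_ARRAY columns now surface as large_binary()/large_utf8() data.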
+ this->RoundTripSingleColumn(values, values, + default_arrow_writer_properties()); + + ArrowReaderProperties arrow_reader_properties; + arrow_reader_properties.set_use_large_binary_variants(true); + + ASSERT_OK_AND_ASSIGN(std::shared_ptr casted, + ::arrow::compute::Cast(*values, ::arrow::large_utf8())); + + this->RoundTripSingleColumn(values, casted, + default_arrow_writer_properties(), + arrow_reader_properties); +} + TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) { std::shared_ptr values; ::arrow::StringBuilder builder; From a8d20a44552c8d9ca27f16c3ccf3cd4525a9bcdb Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 20 Jun 2023 14:21:52 -0300 Subject: [PATCH 53/69] address minor comments --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index ea709276a2a49..f575aa3908633 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -438,13 +438,13 @@ void CheckConfiguredRoundtrip( void DoSimpleRoundtrip(const std::shared_ptr
& table, bool use_threads,
                       int64_t row_group_size, const std::vector<int>& column_subset,
                       std::shared_ptr<Table>
* out,
-                      const std::shared_ptr<ArrowWriterProperties>& arrow_properties =
+                      const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
                           default_arrow_writer_properties(),
                       const ArrowReaderProperties& arrow_reader_properties =
                           default_arrow_reader_properties()) {
   std::shared_ptr<Buffer> buffer;
   ASSERT_NO_FATAL_FAILURE(
-      WriteTableToBuffer(table, row_group_size, arrow_properties, &buffer));
+      WriteTableToBuffer(table, row_group_size, arrow_writer_properties, &buffer));
   std::unique_ptr<FileReader> reader;
   ASSERT_OK_NO_THROW(OpenFile(std::make_shared<BufferReader>(buffer),
@@ -670,20 +670,19 @@ class ParquetIOTestBase : public ::testing::Test {
   void RoundTripSingleColumn(
       const std::shared_ptr<Array>& values, const std::shared_ptr<Array>& expected,
-      const std::shared_ptr<::parquet::ArrowWriterProperties>& arrow_properties,
+      const std::shared_ptr<::parquet::ArrowWriterProperties>& arrow_writer_properties,
       const ArrowReaderProperties& arrow_reader_properties = default_arrow_reader_properties(),
       bool nullable = true) {
    std::shared_ptr<Table>
table = MakeSimpleTable(values, nullable);
     this->ResetSink();
-    ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_, values->length(), default_writer_properties(),
-                                  arrow_properties));
+    ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
+                                  values->length(), default_writer_properties(),
+                                  arrow_writer_properties));

     std::shared_ptr<Table>
out; std::unique_ptr reader; ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader, arrow_reader_properties)); - const bool expect_metadata = arrow_properties->store_schema(); + const bool expect_metadata = arrow_writer_properties->store_schema(); ASSERT_NO_FATAL_FAILURE( this->ReadTableFromFile(std::move(reader), expect_metadata, &out)); ASSERT_EQ(1, out->num_columns()); @@ -1369,8 +1368,6 @@ TEST_F(TestStringParquetIO, Basics) { } ASSERT_OK(builder.Finish(&values)); - // Input is narrow array, but expected output is large array, opposite of the above tests. - // This validates narrow arrays can be read as large arrays. this->RoundTripSingleColumn(values, values, default_arrow_writer_properties()); From 5fcf4e1a9afe615df92f36627a4b96cdf6c89e89 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 21 Jun 2023 09:01:06 -0300 Subject: [PATCH 54/69] use raw filereaderbuilder instead of adding a new openfile function --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 8 ++++++-- cpp/src/parquet/arrow/reader.cc | 7 +------ cpp/src/parquet/arrow/reader.h | 6 ------ 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index f575aa3908633..ccc5a7cec42aa 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -621,8 +621,12 @@ class ParquetIOTestBase : public ::testing::Test { void ReaderFromSink(std::unique_ptr* out, const ArrowReaderProperties& arrow_reader_properties) { ASSERT_OK_AND_ASSIGN(auto buffer, sink_->Finish()); - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), arrow_reader_properties, out)); + + FileReaderBuilder builder; + + ASSERT_OK_NO_THROW(builder.Open(std::make_shared(buffer))); + + ASSERT_OK_NO_THROW(builder.properties(arrow_reader_properties)->memory_pool(::arrow::default_memory_pool())->Build(out)); } void ReadSingleColumnFile(std::unique_ptr file_reader, diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 87ec4cc9141f6..861e5b8011dae 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1372,17 +1372,12 @@ Result> FileReaderBuilder::Build() { return out; } -Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, - std::unique_ptr* reader) { - return OpenFile(std::move(file), pool, default_arrow_reader_properties(), reader); -} Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, - const ArrowReaderProperties& arrow_reader_properties, std::unique_ptr* reader) { FileReaderBuilder builder; RETURN_NOT_OK(builder.Open(std::move(file))); - return builder.properties(arrow_reader_properties)->memory_pool(pool)->Build(reader); + return builder.memory_pool(pool)->Build(reader); } namespace internal { diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 0cfa6eb464927..2cbd36176f5e3 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -369,12 +369,6 @@ ::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* allocator, std::unique_ptr* reader); -PARQUET_EXPORT -::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>, - ::arrow::MemoryPool* allocator, - const ArrowReaderProperties& arrow_reader_properties, - std::unique_ptr* reader); - /// @} PARQUET_EXPORT From 8901cbcacf39a7b2abdb5a2c76d97643e3dce01f Mon Sep 17 00:00:00 2001 
From: Arthur Passos Date: Wed, 21 Jun 2023 09:04:26 -0300 Subject: [PATCH 55/69] rename test --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index ccc5a7cec42aa..0c703d6c2b44a 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -1363,7 +1363,7 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) { using TestStringParquetIO = TestParquetIO<::arrow::StringType>; -TEST_F(TestStringParquetIO, Basics) { +TEST_F(TestStringParquetIO, NonOverflowStringWithUseLargeBinaryVariantsSetting) { std::shared_ptr values; ::arrow::StringBuilder builder; From dff017a223d7f7ecfd2abd7e4fbe750b3f3a6f7c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 21 Jun 2023 10:01:01 -0300 Subject: [PATCH 56/69] update test file name --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 0c703d6c2b44a..1a44b2b526916 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3920,8 +3920,9 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) { TryReadDataFile(path, ::arrow::StatusCode::IOError); } +#ifdef ARROW_WITH_BROTLI TEST(TestArrowParquet, LargeByteArray) { - auto path = test::get_data_file("chunked_string_map.parquet"); + auto path = test::get_data_file("large_string_map.brotli.parquet"); TryReadDataFile(path, ::arrow::StatusCode::NotImplemented); ArrowReaderProperties reader_properties; reader_properties.set_use_large_binary_variants(true); @@ -3930,6 +3931,7 @@ TEST(TestArrowParquet, LargeByteArray) { reader_properties.set_read_dictionary(0, true); TryReadDataFileWithProperties(path, reader_properties); } +#endif TEST(TestArrowReaderAdHoc, LARGE_MEMORY_TEST(LargeStringColumn)) { // ARROW-3762 From 232e01fdd2e479ed6a2f42fc318eb6a82a0248d5 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 21 Jun 2023 14:18:40 -0300 Subject: [PATCH 57/69] update submodule? --- cpp/submodules/parquet-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index b2e7cc7551591..d79a0101d90df 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit b2e7cc755159196e3a068c8594f7acbaecfdaaac +Subproject commit d79a0101d90dfa3bbb10337626f57a3e8c4b5363 From d7d76c67363eb72edd4a900f14f251be23fa82dd Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 21 Jun 2023 14:46:05 -0300 Subject: [PATCH 58/69] aply clang-format --- .../parquet/arrow/arrow_reader_writer_test.cc | 52 +++++++++---------- cpp/src/parquet/arrow/reader.cc | 1 - cpp/src/parquet/arrow/schema.cc | 5 +- cpp/src/parquet/encoding.cc | 29 +++++++---- 4 files changed, 47 insertions(+), 40 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 1a44b2b526916..fdf5d6a43411f 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -438,8 +438,8 @@ void CheckConfiguredRoundtrip( void DoSimpleRoundtrip(const std::shared_ptr
& table, bool use_threads,
                       int64_t row_group_size, const std::vector<int>& column_subset,
                       std::shared_ptr<Table>
* out,
-                      const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
-                          default_arrow_writer_properties(),
+                      const std::shared_ptr<ArrowWriterProperties>&
+                          arrow_writer_properties = default_arrow_writer_properties(),
                       const ArrowReaderProperties& arrow_reader_properties =
                           default_arrow_reader_properties()) {
   std::shared_ptr<Buffer> buffer;
@@ -490,17 +490,15 @@ void DoRoundTripWithBatches(
-void CheckSimpleRoundtrip(
-    const std::shared_ptr<Table>
& table, int64_t row_group_size,
-    const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
-        default_arrow_writer_properties(),
-    const ArrowReaderProperties& arrow_reader_properties =
-        default_arrow_reader_properties()) {
+void CheckSimpleRoundtrip(const std::shared_ptr<Table>
& table, int64_t row_group_size,
+                          const std::shared_ptr<ArrowWriterProperties>&
+                              arrow_writer_properties = default_arrow_writer_properties(),
+                          const ArrowReaderProperties& arrow_reader_properties =
+                              default_arrow_reader_properties()) {
   std::shared_ptr<Table>
result;
-  ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, false /* use_threads */,
-                                            row_group_size, {}, &result,
-                                            arrow_writer_properties,
-                                            arrow_reader_properties));
+  ASSERT_NO_FATAL_FAILURE(
+      DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {}, &result,
+                        arrow_writer_properties, arrow_reader_properties));
   ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
                              /*check_metadata=*/false);
   ASSERT_OK(result->ValidateFull());
@@ -626,7 +624,9 @@ class ParquetIOTestBase : public ::testing::Test {
     ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer)));
-    ASSERT_OK_NO_THROW(builder.properties(arrow_reader_properties)->memory_pool(::arrow::default_memory_pool())->Build(out));
+    ASSERT_OK_NO_THROW(builder.properties(arrow_reader_properties)
+                           ->memory_pool(::arrow::default_memory_pool())
+                           ->Build(out));
   }
   void ReadSingleColumnFile(std::unique_ptr<FileReader> file_reader,
@@ -675,7 +675,8 @@ class ParquetIOTestBase : public ::testing::Test {
   void RoundTripSingleColumn(
       const std::shared_ptr<Array>& values, const std::shared_ptr<Array>& expected,
       const std::shared_ptr<::parquet::ArrowWriterProperties>& arrow_writer_properties,
-      const ArrowReaderProperties& arrow_reader_properties = default_arrow_reader_properties(),
+      const ArrowReaderProperties& arrow_reader_properties =
+          default_arrow_reader_properties(),
       bool nullable = true) {
    std::shared_ptr<Table>
table = MakeSimpleTable(values, nullable);
     this->ResetSink();
@@ -724,10 +725,12 @@ class ParquetIOTestBase : public ::testing::Test {
     CheckSimpleRoundtrip(table, table->num_rows());
   }
-  void CheckRoundTrip(const std::shared_ptr<Table>
& table,
-                      const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties,
-                      const ArrowReaderProperties& arrow_reader_properties) {
-    CheckSimpleRoundtrip(table, table->num_rows(), arrow_writer_properties, arrow_reader_properties);
+  void CheckRoundTrip(
+      const std::shared_ptr<Table>
& table, + const std::shared_ptr& arrow_writer_properties, + const ArrowReaderProperties& arrow_reader_properties) { + CheckSimpleRoundtrip(table, table->num_rows(), arrow_writer_properties, + arrow_reader_properties); } template @@ -1372,8 +1375,7 @@ TEST_F(TestStringParquetIO, NonOverflowStringWithUseLargeBinaryVariantsSetting) } ASSERT_OK(builder.Finish(&values)); - this->RoundTripSingleColumn(values, values, - default_arrow_writer_properties()); + this->RoundTripSingleColumn(values, values, default_arrow_writer_properties()); ArrowReaderProperties arrow_reader_properties; arrow_reader_properties.set_use_large_binary_variants(true); @@ -1381,8 +1383,7 @@ TEST_F(TestStringParquetIO, NonOverflowStringWithUseLargeBinaryVariantsSetting) ASSERT_OK_AND_ASSIGN(std::shared_ptr casted, ::arrow::compute::Cast(*values, ::arrow::large_utf8())); - this->RoundTripSingleColumn(values, casted, - default_arrow_writer_properties(), + this->RoundTripSingleColumn(values, casted, default_arrow_writer_properties(), arrow_reader_properties); } @@ -1435,11 +1436,10 @@ TEST_F(TestLargeBinaryParquetIO, Basics) { ArrowReaderProperties arrow_reader_properties; arrow_reader_properties.set_use_large_binary_variants(true); - // Input is narrow array, but expected output is large array, opposite of the above tests. - // This validates narrow arrays can be read as large arrays. + // Input is narrow array, but expected output is large array, opposite of the above + // tests. This validates narrow arrays can be read as large arrays. this->RoundTripSingleColumn(narrow_array, large_array, - default_arrow_writer_properties(), - arrow_reader_properties); + default_arrow_writer_properties(), arrow_reader_properties); } using TestLargeStringParquetIO = TestParquetIO<::arrow::LargeStringType>; diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 861e5b8011dae..c74a93f419e5c 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1372,7 +1372,6 @@ Result> FileReaderBuilder::Build() { return out; } - Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, std::unique_ptr* reader) { FileReaderBuilder builder; diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 445bc017f5b30..b58ebedb62737 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -462,8 +462,9 @@ struct SchemaTreeContext { bool IsDictionaryReadSupported(const ArrowType& type) { // Only supported currently for BYTE_ARRAY types - return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING - || type.id() == ::arrow::Type::LARGE_BINARY || type.id() == ::arrow::Type::LARGE_STRING; + return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING || + type.id() == ::arrow::Type::LARGE_BINARY || + type.id() == ::arrow::Type::LARGE_STRING; } // ---------------------------------------------------------------------- diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 827309646b7f3..7debd21dd39be 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1632,11 +1632,14 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { [&]() { valid_bytes[i++] = 1; }, [&]() { ++i; }); // It looks like this method is only called by ByteArray types. Previously, - // there was an unconditional cast to ::arrow::Dictionary32Builder<::arrow::BinaryType>. 
- // This won't work for LargeByteArrayType and the Type template argument can't be used - // unconditionally because it is not defined for several other types. - if constexpr (std::is_same_v || std::is_same_v) { - auto binary_builder = checked_cast::DictAccumulator*>(builder); + // there was an unconditional cast to + // ::arrow::Dictionary32Builder<::arrow::BinaryType>. This won't work for + // LargeByteArrayType and the Type template argument can't be used unconditionally + // because it is not defined for several other types. + if constexpr (std::is_same_v || + std::is_same_v) { + auto binary_builder = + checked_cast::DictAccumulator*>(builder); PARQUET_THROW_NOT_OK( binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data())); num_values_ -= num_values - null_count; @@ -1662,11 +1665,14 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { } // It looks like this method is only called by ByteArray types. Previously, - // there was an unconditional cast to ::arrow::Dictionary32Builder<::arrow::BinaryType>. - // This won't work for LargeByteArrayType and the Type template argument can't be used - // unconditionally because it is not defined for several other types. - if constexpr (std::is_same_v || std::is_same_v) { - auto binary_builder = checked_cast::DictAccumulator*>(builder); + // there was an unconditional cast to + // ::arrow::Dictionary32Builder<::arrow::BinaryType>. This won't work for + // LargeByteArrayType and the Type template argument can't be used unconditionally + // because it is not defined for several other types. + if constexpr (std::is_same_v || + std::is_same_v) { + auto binary_builder = + checked_cast::DictAccumulator*>(builder); PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values)); num_values_ -= num_values; return num_values; @@ -3609,7 +3615,8 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, return std::make_unique>(descr, pool); case Type::BYTE_ARRAY: if (use_large_binary_variants) { - return std::make_unique>(descr, pool); + return std::make_unique>(descr, + pool); } else { return std::make_unique>(descr, pool); } From 90ceb0740c0f5dc1305e60861639738d6ff976d9 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 22 Jun 2023 11:29:10 -0300 Subject: [PATCH 59/69] address minor comments --- .../parquet/arrow/arrow_reader_writer_test.cc | 28 ++++--------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index fdf5d6a43411f..4b6b2e3f183e5 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -439,9 +439,7 @@ void DoSimpleRoundtrip(const std::shared_ptr
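A note on the DecodeArrow change above: the decoder is one class template, but only its byte-array instantiations may touch the dictionary builder, so the cast is guarded with if constexpr and discarded at compile time for every other type. The following self-contained sketch shows the same idiom; the type names are illustrative only, not Arrow's.

    #include <cstdint>
    #include <iostream>
    #include <type_traits>

    struct SmallBinary { using offset_type = int32_t; };
    struct LargeBinary { using offset_type = int64_t; };
    struct PlainInt32 {};

    template <typename Type>
    void DecodeBatch(int num_values) {
      if constexpr (std::is_same_v<Type, SmallBinary> ||
                    std::is_same_v<Type, LargeBinary>) {
        // Only instantiated for the two binary variants, so code that would
        // not compile for PlainInt32 (offset arithmetic) is legal here.
        typename Type::offset_type offset = 0;
        std::cout << "binary path, " << 8 * sizeof(offset) << "-bit offsets, "
                  << num_values << " values\n";
      } else {
        std::cout << "generic path, " << num_values << " values\n";
      }
    }

    int main() {
      DecodeBatch<SmallBinary>(10);
      DecodeBatch<LargeBinary>(10);
      DecodeBatch<PlainInt32>(10);
      return 0;
    }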
From 90ceb0740c0f5dc1305e60861639738d6ff976d9 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Thu, 22 Jun 2023 11:29:10 -0300
Subject: [PATCH 59/69] address minor comments

---
 .../parquet/arrow/arrow_reader_writer_test.cc | 28 ++++----------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index fdf5d6a43411f..4b6b2e3f183e5 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -439,9 +439,7 @@ void DoSimpleRoundtrip(const std::shared_ptr<Table>& table, bool use_threads,
                        int64_t row_group_size, const std::vector<int>& column_subset,
                        std::shared_ptr<Table>* out,
                        const std::shared_ptr<ArrowWriterProperties>&
-                           arrow_writer_properties = default_arrow_writer_properties(),
-                       const ArrowReaderProperties& arrow_reader_properties =
-                           default_arrow_reader_properties()) {
+                           arrow_writer_properties = default_arrow_writer_properties()) {
   std::shared_ptr<Buffer> buffer;
   ASSERT_NO_FATAL_FAILURE(
       WriteTableToBuffer(table, row_group_size, arrow_writer_properties, &buffer));
@@ -492,13 +490,11 @@ void DoRoundTripWithBatches(
 void CheckSimpleRoundtrip(const std::shared_ptr<Table>& table, int64_t row_group_size,
                           const std::shared_ptr<ArrowWriterProperties>&
-                              arrow_writer_properties = default_arrow_writer_properties(),
-                          const ArrowReaderProperties& arrow_reader_properties =
-                              default_arrow_reader_properties()) {
+                              arrow_writer_properties = default_arrow_writer_properties()) {
   std::shared_ptr<Table> result;
   ASSERT_NO_FATAL_FAILURE(
       DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {}, &result,
-                        arrow_writer_properties, arrow_reader_properties));
+                        arrow_writer_properties));
   ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
                              /*check_metadata=*/false);
   ASSERT_OK(result->ValidateFull());
@@ -725,14 +721,6 @@ class ParquetIOTestBase : public ::testing::Test {
     CheckSimpleRoundtrip(table, table->num_rows());
   }
 
-  void CheckRoundTrip(
-      const std::shared_ptr<Table>& table,
-      const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties,
-      const ArrowReaderProperties& arrow_reader_properties) {
-    CheckSimpleRoundtrip(table, table->num_rows(), arrow_writer_properties,
-                         arrow_reader_properties);
-  }
-
   template
   void WriteColumn(const std::shared_ptr<GroupNode>& schema,
                    const std::shared_ptr<Array>& values) {
@@ -1366,14 +1354,8 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) {
 
 using TestStringParquetIO = TestParquetIO<::arrow::StringType>;
 
-TEST_F(TestStringParquetIO, NonOverflowStringWithUseLargeBinaryVariantsSetting) {
-  std::shared_ptr<Array> values;
-
-  ::arrow::StringBuilder builder;
-  for (size_t i = 0; i < SMALL_SIZE; i++) {
-    ASSERT_OK(builder.Append("abc"));
-  }
-  ASSERT_OK(builder.Finish(&values));
+TEST_F(TestStringParquetIO, SmallStringWithLargeBinaryVariantSetting) {
+  auto values = ArrayFromJSON(::arrow::utf8(), R"(["foo", "", null, "bar"])");
 
   this->RoundTripSingleColumn(values, values, default_arrow_writer_properties());
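For orientation, this is what the round-trip tests are exercising from the caller's side. The sketch below assumes only APIs that appear in this patch series (FileReaderBuilder, ArrowReaderProperties::set_use_large_binary_variants) plus standard Arrow file I/O; it is an illustration, not part of the patch.

    #include <arrow/api.h>
    #include <arrow/io/file.h>
    #include <parquet/arrow/reader.h>
    #include <parquet/properties.h>

    // Open a Parquet file and decode BYTE_ARRAY columns into the large
    // (64-bit offset) Arrow variants, e.g. utf8 columns come back large_utf8.
    arrow::Status ReadAsLargeVariants(const std::string& path,
                                      std::shared_ptr<arrow::Table>* out) {
      ARROW_ASSIGN_OR_RAISE(auto infile, arrow::io::ReadableFile::Open(path));

      parquet::ArrowReaderProperties arrow_reader_properties;
      arrow_reader_properties.set_use_large_binary_variants(true);

      parquet::arrow::FileReaderBuilder builder;
      ARROW_RETURN_NOT_OK(builder.Open(infile));
      std::unique_ptr<parquet::arrow::FileReader> reader;
      ARROW_RETURN_NOT_OK(builder.properties(arrow_reader_properties)
                              ->memory_pool(arrow::default_memory_pool())
                              ->Build(&reader));
      return reader->ReadTable(out);
    }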
From 03949636fc1e721fa2ed5dac75621f89efd1af1d Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Thu, 22 Jun 2023 13:47:39 -0300
Subject: [PATCH 60/69] delta & delta length for large*

---
 cpp/src/parquet/encoding.cc | 48 ++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 7debd21dd39be..72389c08e2e9c 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -2821,10 +2821,11 @@ std::shared_ptr<Buffer> DeltaLengthByteArrayEncoder<DType>::FlushValues() {
 // ----------------------------------------------------------------------
 // DeltaLengthByteArrayDecoder
 
-class DeltaLengthByteArrayDecoder : public DecoderImpl,
-                                    virtual public TypedDecoder<ByteArrayType> {
+template <typename Type>
+class DeltaLengthByteArrayDecoderBase : public DecoderImpl,
+                                    virtual public TypedDecoder<Type> {
  public:
-  explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr,
+  explicit DeltaLengthByteArrayDecoderBase(const ColumnDescriptor* descr,
                                        MemoryPool* pool = ::arrow::default_memory_pool())
       : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY),
         len_decoder_(nullptr, pool),
         buffered_length_(AllocateBuffer(pool, 0)) {}
@@ -2875,7 +2876,7 @@
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
-                  typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+                  typename EncodingTraits<Type>::Accumulator* out) override {
     int result = 0;
     PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
                                           valid_bits_offset, out, &result));
@@ -2884,7 +2885,7 @@
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
-                  typename EncodingTraits<ByteArrayType>::DictAccumulator* out) override {
+                  typename EncodingTraits<Type>::DictAccumulator* out) override {
     ParquetException::NYI(
         "DecodeArrow of DictAccumulator for DeltaLengthByteArrayDecoder");
   }
@@ -2910,9 +2911,9 @@
   Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
                           int64_t valid_bits_offset,
-                          typename EncodingTraits<ByteArrayType>::Accumulator* out,
+                          typename EncodingTraits<Type>::Accumulator* out,
                           int* out_num_values) {
-    ArrowBinaryHelper helper(out);
+    ArrowBinaryHelperBase<Type> helper(out);
 
     std::vector values(num_values - null_count);
     const int num_valid_values = Decode(values.data(), num_values - null_count);
@@ -2953,6 +2954,9 @@
   std::shared_ptr<ResizableBuffer> buffered_length_;
 };
 
+using DeltaLengthByteArrayDecoder = DeltaLengthByteArrayDecoderBase<ByteArrayType>;
+using DeltaLengthLargeByteArrayDecoder = DeltaLengthByteArrayDecoderBase<LargeByteArrayType>;
+
 // ----------------------------------------------------------------------
 // RLE_BOOLEAN_ENCODER
@@ -3143,10 +3147,11 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
 
 // ----------------------------------------------------------------------
 // DELTA_BYTE_ARRAY
 
-class DeltaByteArrayDecoder : public DecoderImpl,
-                              virtual public TypedDecoder<ByteArrayType> {
+template <typename Type>
+class DeltaByteArrayDecoderBase : public DecoderImpl,
+                              virtual public TypedDecoder<Type> {
  public:
-  explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr,
+  explicit DeltaByteArrayDecoderBase(const ColumnDescriptor* descr,
                                  MemoryPool* pool = ::arrow::default_memory_pool())
       : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY),
         prefix_len_decoder_(nullptr, pool),
@@ -3189,7 +3194,7 @@
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
-                  typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+                  typename EncodingTraits<Type>::Accumulator* out) override {
     int result = 0;
     PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
                                           valid_bits_offset, out, &result));
@@ -3199,7 +3204,7 @@
   int DecodeArrow(
       int num_values, int null_count, const uint8_t* valid_bits,
       int64_t valid_bits_offset,
-      typename EncodingTraits<ByteArrayType>::DictAccumulator* builder) override {
+      typename EncodingTraits<Type>::DictAccumulator* builder) override {
     ParquetException::NYI("DecodeArrow of DictAccumulator for DeltaByteArrayDecoder");
   }
@@ -3261,9 +3266,9 @@
   Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
                           int64_t valid_bits_offset,
-                          typename EncodingTraits<ByteArrayType>::Accumulator* out,
+                          typename EncodingTraits<Type>::Accumulator* out,
                           int* out_num_values) {
-    ArrowBinaryHelper helper(out);
+    ArrowBinaryHelperBase<Type> helper(out);
     std::vector values(num_values);
     const int num_valid_values = GetInternal(values.data(), num_values - null_count);
@@ -3306,6 +3311,9 @@
   std::shared_ptr<ResizableBuffer> buffered_data_;
 };
 
+using DeltaByteArrayDecoder = DeltaByteArrayDecoderBase<ByteArrayType>;
+using DeltaLargeByteArrayDecoder = DeltaByteArrayDecoderBase<LargeByteArrayType>;
+
 // ----------------------------------------------------------------------
 // BYTE_STREAM_SPLIT
@@ -3576,12 +3584,20 @@ std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encodin
     }
   } else if (encoding == Encoding::DELTA_BYTE_ARRAY) {
     if (type_num == Type::BYTE_ARRAY) {
-      return std::make_unique<DeltaByteArrayDecoder>(descr, pool);
+      if (use_large_binary_variants) {
+        return std::make_unique<DeltaByteArrayDecoder>(descr);
+      } else {
+        return std::make_unique<DeltaLargeByteArrayDecoder>(descr);
+      }
    }
    throw ParquetException("DELTA_BYTE_ARRAY only supports BYTE_ARRAY");
  } else if (encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY) {
    if (type_num == Type::BYTE_ARRAY) {
-      return std::make_unique<DeltaLengthByteArrayDecoder>(descr, pool);
+      if (use_large_binary_variants) {
+        return std::make_unique<DeltaLengthByteArrayDecoder>(descr, pool);
+      } else {
+        return std::make_unique<DeltaLengthLargeByteArrayDecoder>(descr, pool);
+      }
    }
    throw ParquetException("DELTA_LENGTH_BYTE_ARRAY only supports BYTE_ARRAY");
  } else if (encoding == Encoding::RLE) {
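The shape of this refactor is worth calling out: the concrete DeltaLengthByteArrayDecoder and DeltaByteArrayDecoder classes become templated *Base classes, and the old names are kept as aliases so existing call sites compile unchanged while the Large* instantiations are added beside them. Schematically (illustrative names, not the real decoders):

    #include <cstdint>
    #include <vector>

    struct ByteArrayRef { uint32_t len; const uint8_t* ptr; };
    struct LargeByteArrayRef { uint64_t len; const uint8_t* ptr; };

    // One implementation, templated on the element type.
    template <typename Elem>
    class VarLenDecoderBase {
     public:
      // Stub decode: real code would fill lengths/pointers from a data page.
      void Decode(std::vector<Elem>* out, int n) const { out->assign(n, Elem{}); }
    };

    // Old name preserved as an alias; callers are untouched.
    using VarLenDecoder = VarLenDecoderBase<ByteArrayRef>;
    // New 64-bit-length variant reuses the same implementation.
    using LargeVarLenDecoder = VarLenDecoderBase<LargeByteArrayRef>;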
From a8df2e7a15d1851bcb33418f0f6719b9f1d0ad1e Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Thu, 22 Jun 2023 14:03:32 -0300
Subject: [PATCH 61/69] fix wrong if statements

---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 4 +++-
 cpp/src/parquet/encoding.cc                       | 8 ++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 4b6b2e3f183e5..531d854369433 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -1354,6 +1354,7 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) {
 
 using TestStringParquetIO = TestParquetIO<::arrow::StringType>;
 
+#if defined(_WIN64) || defined(__x86_64__)
 TEST_F(TestStringParquetIO, SmallStringWithLargeBinaryVariantSetting) {
   auto values = ArrayFromJSON(::arrow::utf8(), R"(["foo", "", null, "bar"])");
 
@@ -1368,6 +1369,7 @@ TEST_F(TestStringParquetIO, SmallStringWithLargeBinaryVariantSetting) {
   this->RoundTripSingleColumn(values, casted, default_arrow_writer_properties(),
                               arrow_reader_properties);
 }
+#endif
 
 TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {
   std::shared_ptr<Array> values;
@@ -3902,7 +3904,7 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }
 
-#ifdef ARROW_WITH_BROTLI
+#if defined(ARROW_WITH_BROTLI) && (defined(_WIN64) || defined(__x86_64__))
 TEST(TestArrowParquet, LargeByteArray) {
   auto path = test::get_data_file("large_string_map.brotli.parquet");
   TryReadDataFile(path, ::arrow::StatusCode::NotImplemented);
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 72389c08e2e9c..0031f48a62dd6 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -3585,18 +3585,18 @@ std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encodin
   } else if (encoding == Encoding::DELTA_BYTE_ARRAY) {
     if (type_num == Type::BYTE_ARRAY) {
       if (use_large_binary_variants) {
-        return std::make_unique<DeltaByteArrayDecoder>(descr);
-      } else {
         return std::make_unique<DeltaLargeByteArrayDecoder>(descr);
+      } else {
+        return std::make_unique<DeltaByteArrayDecoder>(descr);
       }
     }
     throw ParquetException("DELTA_BYTE_ARRAY only supports BYTE_ARRAY");
   } else if (encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY) {
     if (type_num == Type::BYTE_ARRAY) {
       if (use_large_binary_variants) {
-        return std::make_unique<DeltaLengthByteArrayDecoder>(descr, pool);
-      } else {
         return std::make_unique<DeltaLengthLargeByteArrayDecoder>(descr, pool);
+      } else {
+        return std::make_unique<DeltaLengthByteArrayDecoder>(descr, pool);
       }
     }
     throw ParquetException("DELTA_LENGTH_BYTE_ARRAY only supports BYTE_ARRAY");

From 2bb3b14376949c77adf85ec9f0149d23c4db7cfc Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Fri, 23 Jun 2023 09:10:06 -0300
Subject: [PATCH 62/69] Template member variable as well

---
 cpp/src/parquet/encoding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 0031f48a62dd6..4a8e96948b093 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -3301,7 +3301,7 @@ class DeltaByteArrayDecoderBase : public DecoderImpl,
   std::shared_ptr<::arrow::bit_util::BitReader> decoder_;
   DeltaBitPackDecoder<Int32Type> prefix_len_decoder_;
-  DeltaLengthByteArrayDecoder suffix_decoder_;
+  DeltaLengthByteArrayDecoderBase<Type> suffix_decoder_;
   std::string last_value_;
   // string buffer for last value in previous page
   std::string last_value_in_previous_page_;
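On the platform guards introduced above: the large-binary tests need a 64-bit address space, so they are compiled only on 64-bit targets. _WIN64 covers 64-bit MSVC; __x86_64__ covers x86-64 GCC/Clang but misses other 64-bit architectures such as AArch64, which is presumably why the final commit in this series switches to __LP64__ (defined on LP64 ABIs generally, never by MSVC, hence _WIN64 stays as the alternative). A sketch of the guard with a compile-time cross-check; the constant name is illustrative:

    #include <cstdint>

    #if defined(_WIN64) || defined(__LP64__)
    // Both macros imply 8-byte pointers, i.e. enough address space for
    // large binary offsets.
    static_assert(sizeof(void*) == 8, "expected a 64-bit target");
    inline constexpr bool kLargeBinaryTestsEnabled = true;
    #else
    inline constexpr bool kLargeBinaryTestsEnabled = false;
    #endif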
From c114d441da3d715a24831aa56f28aadecb3552f1 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Fri, 23 Jun 2023 09:13:29 -0300
Subject: [PATCH 63/69] add docstring

---
 cpp/src/parquet/properties.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index e59b2e4e84254..2b027ff6ab38f 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -852,10 +852,12 @@ class PARQUET_EXPORT ArrowReaderProperties {
     return coerce_int96_timestamp_unit_;
   }
 
+  /// Set whether to use large binary variants for binary data
+  /// (default is false).
   void set_use_large_binary_variants(bool use_large_binary_variants) {
     use_large_binary_variants_ = use_large_binary_variants;
   }
-
+  /// Return whether large binary variants are enabled.
   bool use_large_binary_variants() const { return use_large_binary_variants_; }
 
  private:

From d1d57989941bcf29f120671a76d9bdf141a4151f Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Fri, 23 Jun 2023 09:16:16 -0300
Subject: [PATCH 64/69] add LargeStringDictionary32Builder

---
 cpp/src/arrow/array/builder_dict.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h
index 3adf5b843b916..f46eaefc74b6f 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -725,6 +725,7 @@ using StringDictionaryBuilder = DictionaryBuilder<StringType>;
 using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
 using StringDictionary32Builder = Dictionary32Builder<StringType>;
 using LargeBinaryDictionary32Builder = Dictionary32Builder<LargeBinaryType>;
+using LargeStringDictionary32Builder = Dictionary32Builder<LargeStringType>;
 
 /// @}
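The new alias gives 32-bit dictionary indices over large_utf8 values. A usage sketch, assuming the standard DictionaryBuilder API (Append/Finish) behaves for LargeStringType as it does for StringType:

    #include <arrow/api.h>
    #include <iostream>

    arrow::Status BuildLargeStringDict() {
      // From builder_dict.h: Dictionary32Builder<LargeStringType>.
      arrow::LargeStringDictionary32Builder builder;
      ARROW_RETURN_NOT_OK(builder.Append("foo"));
      ARROW_RETURN_NOT_OK(builder.Append("bar"));
      ARROW_RETURN_NOT_OK(builder.Append("foo"));  // dedups against entry 0

      std::shared_ptr<arrow::Array> array;
      ARROW_RETURN_NOT_OK(builder.Finish(&array));
      // Expected: dictionary<values=large_string, indices=int32>
      std::cout << array->type()->ToString() << std::endl;
      return arrow::Status::OK();
    }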
From 0eaa60fae50d0f27000270bb017b60a47b8a87a8 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 26 Jun 2023 10:41:31 -0300
Subject: [PATCH 65/69] address a few comments

---
 .../parquet/arrow/arrow_reader_writer_test.cc | 31 +++++++++++++++++--
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 531d854369433..072d8c6935379 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -617,9 +617,7 @@ class ParquetIOTestBase : public ::testing::Test {
     ASSERT_OK_AND_ASSIGN(auto buffer, sink_->Finish());
 
     FileReaderBuilder builder;
-
     ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer)));
-
     ASSERT_OK_NO_THROW(builder.properties(arrow_reader_properties)
                            ->memory_pool(::arrow::default_memory_pool())
                            ->Build(out));
@@ -4603,16 +4601,22 @@ TEST(TestArrowWriteDictionaries, NestedSubfield) {
 class TestArrowReadDeltaEncoding : public ::testing::Test {
  public:
   void ReadTableFromParquetFile(const std::string& file_name,
+                                const ArrowReaderProperties& properties,
                                 std::shared_ptr<Table>* out) {
     auto file = test::get_data_file(file_name);
     auto pool = ::arrow::default_memory_pool();
     std::unique_ptr<FileReader> parquet_reader;
     ASSERT_OK(FileReader::Make(pool, ParquetFileReader::OpenFile(file, false),
-                               &parquet_reader));
+                               properties, &parquet_reader));
     ASSERT_OK(parquet_reader->ReadTable(out));
     ASSERT_OK((*out)->ValidateFull());
   }
 
+  void ReadTableFromParquetFile(const std::string& file_name,
+                                std::shared_ptr<Table>* out) {
+    return ReadTableFromParquetFile(file_name, default_arrow_reader_properties(), out);
+  }
+
   void ReadTableFromCSVFile(const std::string& file_name,
                             const ::arrow::csv::ConvertOptions& convert_options,
                             std::shared_ptr<Table>* out) {
@@ -4660,6 +4664,27 @@ TEST_F(TestArrowReadDeltaEncoding, DeltaByteArray) {
   ::arrow::AssertTablesEqual(*actual_table, *expect_table, false);
 }
 
+TEST_F(TestArrowReadDeltaEncoding, DeltaByteArrayWithLargeBinaryVariant) {
+  std::shared_ptr<::arrow::Table> actual_table, expect_table;
+  ArrowReaderProperties properties;
+  properties.set_use_large_binary_variants(true);
+
+  ReadTableFromParquetFile("delta_byte_array.parquet", properties, &actual_table);
+
+  auto convert_options = ::arrow::csv::ConvertOptions::Defaults();
+  std::vector<std::string> column_names = {
+      "c_customer_id", "c_salutation",          "c_first_name",
+      "c_last_name",   "c_preferred_cust_flag", "c_birth_country",
+      "c_login",       "c_email_address",       "c_last_review_date"};
+  for (auto name : column_names) {
+    convert_options.column_types[name] = ::arrow::large_utf8();
+  }
+  convert_options.strings_can_be_null = true;
+  ReadTableFromCSVFile("delta_byte_array_expect.csv", convert_options, &expect_table);
+
+  ::arrow::AssertTablesEqual(*actual_table, *expect_table, false);
+}
+
 TEST_F(TestArrowReadDeltaEncoding, IncrementalDecodeDeltaByteArray) {
   auto file = test::get_data_file("delta_byte_array.parquet");
   auto pool = ::arrow::default_memory_pool();
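The expected tables in these delta-encoding tests come from CSV fixtures whose columns are coerced to the large types, so AssertTablesEqual compares like with like. A standalone sketch of that loading pattern with the Arrow CSV reader (paths and column names are illustrative):

    #include <arrow/api.h>
    #include <arrow/csv/api.h>
    #include <arrow/io/file.h>

    arrow::Result<std::shared_ptr<arrow::Table>> ReadExpectation(
        const std::string& path) {
      ARROW_ASSIGN_OR_RAISE(auto input, arrow::io::ReadableFile::Open(path));

      auto read_options = arrow::csv::ReadOptions::Defaults();
      auto parse_options = arrow::csv::ParseOptions::Defaults();
      auto convert_options = arrow::csv::ConvertOptions::Defaults();
      // Force string-like columns to 64-bit offsets to match the reader output.
      convert_options.column_types["c_customer_id"] = arrow::large_utf8();
      convert_options.strings_can_be_null = true;

      ARROW_ASSIGN_OR_RAISE(auto reader,
                            arrow::csv::TableReader::Make(
                                arrow::io::default_io_context(), input, read_options,
                                parse_options, convert_options));
      return reader->Read();
    }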
From 1e642fae8ec8f307fd327fd6e933558b05422278 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 26 Jun 2023 10:42:34 -0300
Subject: [PATCH 66/69] clang format

---
 .../parquet/arrow/arrow_reader_writer_test.cc | 17 ++++++++-------
 cpp/src/parquet/encoding.cc                   | 21 +++++++++----------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 072d8c6935379..e1aa14e9a9442 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -488,13 +488,14 @@ void DoRoundTripWithBatches(
   ASSERT_OK_AND_ASSIGN(*out, Table::FromRecordBatchReader(batch_reader.get()));
 }
 
-void CheckSimpleRoundtrip(const std::shared_ptr<Table>& table, int64_t row_group_size,
-                          const std::shared_ptr<ArrowWriterProperties>&
-                              arrow_writer_properties = default_arrow_writer_properties()) {
+void CheckSimpleRoundtrip(
+    const std::shared_ptr<Table>& table, int64_t row_group_size,
+    const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
+        default_arrow_writer_properties()) {
   std::shared_ptr<Table> result;
-  ASSERT_NO_FATAL_FAILURE(
-      DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {}, &result,
-                        arrow_writer_properties));
+  ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, false /* use_threads */,
+                                            row_group_size, {}, &result,
+                                            arrow_writer_properties));
   ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(),
                              /*check_metadata=*/false);
   ASSERT_OK(result->ValidateFull());
@@ -4606,8 +4607,8 @@ class TestArrowReadDeltaEncoding : public ::testing::Test {
     auto file = test::get_data_file(file_name);
     auto pool = ::arrow::default_memory_pool();
     std::unique_ptr<FileReader> parquet_reader;
-    ASSERT_OK(FileReader::Make(pool, ParquetFileReader::OpenFile(file, false),
-                               properties, &parquet_reader));
+    ASSERT_OK(FileReader::Make(pool, ParquetFileReader::OpenFile(file, false), properties,
+                               &parquet_reader));
     ASSERT_OK(parquet_reader->ReadTable(out));
     ASSERT_OK((*out)->ValidateFull());
   }
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 4a8e96948b093..3d6bb01bf9752 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -2823,10 +2823,10 @@ std::shared_ptr<Buffer> DeltaLengthByteArrayEncoder<DType>::FlushValues() {
 template <typename Type>
 class DeltaLengthByteArrayDecoderBase : public DecoderImpl,
-                                    virtual public TypedDecoder<Type> {
+                                        virtual public TypedDecoder<Type> {
  public:
-  explicit DeltaLengthByteArrayDecoderBase(const ColumnDescriptor* descr,
-                                       MemoryPool* pool = ::arrow::default_memory_pool())
+  explicit DeltaLengthByteArrayDecoderBase(
+      const ColumnDescriptor* descr, MemoryPool* pool = ::arrow::default_memory_pool())
       : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY),
         len_decoder_(nullptr, pool),
         buffered_length_(AllocateBuffer(pool, 0)) {}
@@ -2955,7 +2955,8 @@
 using DeltaLengthByteArrayDecoder = DeltaLengthByteArrayDecoderBase<ByteArrayType>;
-using DeltaLengthLargeByteArrayDecoder = DeltaLengthByteArrayDecoderBase<LargeByteArrayType>;
+using DeltaLengthLargeByteArrayDecoder =
+    DeltaLengthByteArrayDecoderBase<LargeByteArrayType>;
 
 // ----------------------------------------------------------------------
 // RLE_BOOLEAN_ENCODER
@@ -3148,11 +3149,10 @@
 // ----------------------------------------------------------------------
 // DELTA_BYTE_ARRAY
 
 template <typename Type>
-class DeltaByteArrayDecoderBase : public DecoderImpl,
-                              virtual public TypedDecoder<Type> {
+class DeltaByteArrayDecoderBase : public DecoderImpl, virtual public TypedDecoder<Type> {
  public:
   explicit DeltaByteArrayDecoderBase(const ColumnDescriptor* descr,
-                                 MemoryPool* pool = ::arrow::default_memory_pool())
+                                     MemoryPool* pool = ::arrow::default_memory_pool())
       : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY),
         prefix_len_decoder_(nullptr, pool),
         suffix_decoder_(nullptr, pool),
@@ -3201,10 +3201,9 @@
-  int DecodeArrow(
-      int num_values, int null_count, const uint8_t* valid_bits,
-      int64_t valid_bits_offset,
-      typename EncodingTraits<Type>::DictAccumulator* builder) override {
+  int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+                  int64_t valid_bits_offset,
+                  typename EncodingTraits<Type>::DictAccumulator* builder) override {
     ParquetException::NYI("DecodeArrow of DictAccumulator for DeltaByteArrayDecoder");
   }
From b299497b3d08245fcc3ff6df4e483026bdc43a67 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 26 Jun 2023 15:52:41 -0300
Subject: [PATCH 67/69] add binarypacked test for largebinaryvariant

---
 .../parquet/arrow/arrow_reader_writer_test.cc | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index e1aa14e9a9442..7ae5262e128a8 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -4647,6 +4647,24 @@ TEST_F(TestArrowReadDeltaEncoding, DeltaBinaryPacked) {
   ::arrow::AssertTablesEqual(*actual_table, *expect_table);
 }
 
+TEST_F(TestArrowReadDeltaEncoding, DeltaBinaryPackedWithLargeBinaryVariant) {
+  std::shared_ptr<::arrow::Table> actual_table, expect_table;
+  ArrowReaderProperties properties;
+  properties.set_use_large_binary_variants(true);
+
+  ReadTableFromParquetFile("delta_binary_packed.parquet", properties, &actual_table);
+
+  auto convert_options = ::arrow::csv::ConvertOptions::Defaults();
+  for (int i = 0; i <= 64; ++i) {
+    std::string column_name = "bitwidth" + std::to_string(i);
+    convert_options.column_types[column_name] = ::arrow::int64();
+  }
+  convert_options.column_types["int_value"] = ::arrow::int32();
+  ReadTableFromCSVFile("delta_binary_packed_expect.csv", convert_options, &expect_table);
+
+  ::arrow::AssertTablesEqual(*actual_table, *expect_table);
+}
+
 TEST_F(TestArrowReadDeltaEncoding, DeltaByteArray) {
   std::shared_ptr<::arrow::Table> actual_table, expect_table;
   ReadTableFromParquetFile("delta_byte_array.parquet", &actual_table);

From 2c23dd701f74e8586c42ea36a0bc4ae1fb34b578 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 27 Jun 2023 09:22:02 -0300
Subject: [PATCH 68/69] Revert "add binarypacked test for largebinaryvariant"

This reverts commit b299497b3d08245fcc3ff6df4e483026bdc43a67.

---
 .../parquet/arrow/arrow_reader_writer_test.cc | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 7ae5262e128a8..e1aa14e9a9442 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -4647,24 +4647,6 @@ TEST_F(TestArrowReadDeltaEncoding, DeltaBinaryPacked) {
   ::arrow::AssertTablesEqual(*actual_table, *expect_table);
 }
 
-TEST_F(TestArrowReadDeltaEncoding, DeltaBinaryPackedWithLargeBinaryVariant) {
-  std::shared_ptr<::arrow::Table> actual_table, expect_table;
-  ArrowReaderProperties properties;
-  properties.set_use_large_binary_variants(true);
-
-  ReadTableFromParquetFile("delta_binary_packed.parquet", properties, &actual_table);
-
-  auto convert_options = ::arrow::csv::ConvertOptions::Defaults();
-  for (int i = 0; i <= 64; ++i) {
-    std::string column_name = "bitwidth" + std::to_string(i);
-    convert_options.column_types[column_name] = ::arrow::int64();
-  }
-  convert_options.column_types["int_value"] = ::arrow::int32();
-  ReadTableFromCSVFile("delta_binary_packed_expect.csv", convert_options, &expect_table);
-
-  ::arrow::AssertTablesEqual(*actual_table, *expect_table);
-}
-
 TEST_F(TestArrowReadDeltaEncoding, DeltaByteArray) {
   std::shared_ptr<::arrow::Table> actual_table, expect_table;
   ReadTableFromParquetFile("delta_byte_array.parquet", &actual_table);

From eca9d6f9bff17c6eb64eab069cdebbcfa88f976a Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Thu, 6 Jul 2023 09:10:08 -0300
Subject: [PATCH 69/69] only run largebinary tests if system is 64bit

---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 6 ++++--
 cpp/src/parquet/encoding.cc                       | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index e1aa14e9a9442..2f3e8953daaf0 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -1353,7 +1353,7 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) {
 
 using TestStringParquetIO = TestParquetIO<::arrow::StringType>;
 
-#if defined(_WIN64) || defined(__x86_64__)
+#if defined(_WIN64) || defined(__LP64__)
 TEST_F(TestStringParquetIO, SmallStringWithLargeBinaryVariantSetting) {
   auto values = ArrayFromJSON(::arrow::utf8(), R"(["foo", "", null, "bar"])");
 
@@ -1397,6 +1397,7 @@ TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {
 
 using TestLargeBinaryParquetIO = TestParquetIO<::arrow::LargeBinaryType>;
 
+#if defined(_WIN64) || defined(__LP64__)
 TEST_F(TestLargeBinaryParquetIO, Basics) {
   const char* json = "[\"foo\", \"\", null, \"\xff\"]";
 
@@ -1447,6 +1448,7 @@ TEST_F(TestLargeStringParquetIO, Basics) {
       ::parquet::ArrowWriterProperties::Builder().store_schema()->build();
   this->RoundTripSingleColumn(large_array, large_array, arrow_properties);
 }
+#endif
 
 using TestNullParquetIO = TestParquetIO<::arrow::NullType>;
 
@@ -3903,7 +3905,7 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }
 
-#if defined(ARROW_WITH_BROTLI) && (defined(_WIN64) || defined(__x86_64__))
+#if defined(ARROW_WITH_BROTLI) && defined(__LP64__)
 TEST(TestArrowParquet, LargeByteArray) {
   auto path = test::get_data_file("large_string_map.brotli.parquet");
   TryReadDataFile(path, ::arrow::StatusCode::NotImplemented);
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 3d6bb01bf9752..bb931ecb5e929 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -1534,7 +1534,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
     offset_type total_size = 0;
     for (int i = 0; i < dictionary_length_; ++i) {
       if (AddWithOverflow(total_size, dict_values[i].len, &total_size)) {
-        throw ParquetException("String/Binary length to large");
+        throw ParquetException("String/Binary length too large");
       }
     }
     PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,