GH-32723: [C++][Parquet] Add option to use LARGE* variants of binary types #35825

Status: Closed. Wants to merge 69 commits; changes shown from 14 of the 69 commits.
e5e96ec
able to read the file
arthurpassos May 26, 2023
b9b48f8
remove diff out
arthurpassos May 26, 2023
ae62954
intermediate stage, not working properly anymore..
arthurpassos May 29, 2023
34917d5
still not working
arthurpassos May 29, 2023
835b07d
able to read the file again
arthurpassos May 29, 2023
50427c6
move use_binary_large_variants to arrowreaderproperties
arthurpassos May 30, 2023
df65ce7
cleanup a bit
arthurpassos May 30, 2023
e826b8e
back fromByteArray string & binary with setting
arthurpassos May 30, 2023
764ef98
some more adjustments
arthurpassos May 30, 2023
c6244ea
revert some stuff
arthurpassos May 30, 2023
5a4bbb0
revert some stuff
arthurpassos May 30, 2023
2d84e57
improvement
arthurpassos May 30, 2023
90f14df
remove dictionary64
arthurpassos May 30, 2023
b88b024
use 64bit on largebytearray class and initialize binary_large_variant…
arthurpassos May 31, 2023
0b53b05
add chunked string map test
arthurpassos May 31, 2023
f574e2e
add boolean comment
arthurpassos May 31, 2023
295e062
Make ChunkedRecordReader generic by using templates
arthurpassos May 31, 2023
25d7815
Make ByteArrayDictionaryReader generic with the use of templates
arthurpassos May 31, 2023
fe8d67b
make arrowbinaryhelper generic
arthurpassos May 31, 2023
35e5835
Make PlainByteArrayDecoder generic
arthurpassos May 31, 2023
9aff2f3
remove use_binary_large_variant from parquet reader properties
arthurpassos Jun 1, 2023
eb850c4
removed parquet::type::large_Byte_array
arthurpassos Jun 5, 2023
c2aab63
small adjustment
arthurpassos Jun 5, 2023
837ed6c
remove largebytearray class
arthurpassos Jun 6, 2023
35cdb99
simplify largebytearraytype a bit
arthurpassos Jun 6, 2023
a5000e1
simplify dictbytearraydecoderimpl a bit
arthurpassos Jun 6, 2023
eb71c17
remove one default argument
arthurpassos Jun 6, 2023
686a3f7
remove junk code
arthurpassos Jun 6, 2023
a61fc32
move use_binary_large_variant check inside frombytearray
arthurpassos Jun 6, 2023
e2600d0
simplify chunkedrecordreader a bit
arthurpassos Jun 6, 2023
3b86e23
simplify DictionaryRecordReaderImpl and fix DebugPrintState
arthurpassos Jun 6, 2023
cc027b7
simplify PlainByteArrayDecoderBase
arthurpassos Jun 6, 2023
177db7a
remove some todos
arthurpassos Jun 7, 2023
66223ee
Add comment explaining why struct LargeByteArrayType instead of alias
arthurpassos Jun 7, 2023
5cd39d8
address some pr comments
arthurpassos Jun 8, 2023
1089010
address a few more comments
arthurpassos Jun 8, 2023
a6c42ee
remove arrow-type include & move binarylimit trait
arthurpassos Jun 8, 2023
15be2a2
consolidate setdict
arthurpassos Jun 8, 2023
8d5ba3d
apply clangformat
arthurpassos Jun 8, 2023
fd8f979
removed todos
arthurpassos Jun 9, 2023
a5736d5
a bit more renaming
arthurpassos Jun 9, 2023
b4ecd0d
address one mor comment
arthurpassos Jun 9, 2023
9e9dff9
add overflow check in dict
arthurpassos Jun 9, 2023
ae1db20
address a few comments
arthurpassos Jun 12, 2023
09a9eaf
use int32_t explicitly
arthurpassos Jun 14, 2023
1664983
use template directly
arthurpassos Jun 14, 2023
322319e
use offset_type
arthurpassos Jun 15, 2023
1775a7a
address comments
arthurpassos Jun 15, 2023
7f6e2bf
address a few minor comments
arthurpassos Jun 16, 2023
75fb615
fix DictDecoderImpl
arthurpassos Jun 16, 2023
0801267
add non overflow test
arthurpassos Jun 16, 2023
7f09a16
string test
arthurpassos Jun 19, 2023
a8d20a4
address minor comments
arthurpassos Jun 20, 2023
5fcf4e1
use raw filereaderbuilder instead of adding a new openfile function
arthurpassos Jun 21, 2023
8901cbc
rename test
arthurpassos Jun 21, 2023
dff017a
update test file name
arthurpassos Jun 21, 2023
232e01f
update submodule?
arthurpassos Jun 21, 2023
d7d76c6
aply clang-format
arthurpassos Jun 21, 2023
90ceb07
address minor comments
arthurpassos Jun 22, 2023
0394963
delta & delta length for large*
arthurpassos Jun 22, 2023
a8df2e7
fix wrong if statements
arthurpassos Jun 22, 2023
2bb3b14
Template member variable as well
arthurpassos Jun 23, 2023
c114d44
add docstring
arthurpassos Jun 23, 2023
d1d5798
add LargeStringDictionary32Builder
arthurpassos Jun 23, 2023
0eaa60f
address a few comments
arthurpassos Jun 26, 2023
1e642fa
clang format
arthurpassos Jun 26, 2023
b299497
add binarypacked test for largebinaryvariant
arthurpassos Jun 26, 2023
2c23dd7
Revert "add binarypacked test for largebinaryvariant"
arthurpassos Jun 27, 2023
eca9d6f
only run largebinary tests if system is 64bit
arthurpassos Jul 6, 2023
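
The changes below thread a `use_binary_large_variants` flag from `ArrowReaderProperties` through the schema conversion and down to `RecordReader::Make`. For orientation, enabling it from user code would look roughly like the sketch below. This is a sketch only: it requires the Apache Arrow/Parquet headers, the setter name `set_use_binary_large_variants` is inferred from the getter added in this diff, and the PR was ultimately closed without being merged in this form.

```
// Sketch (assumed API): read a Parquet file so that BYTE_ARRAY columns
// come back as large_binary()/large_utf8() instead of the 32-bit-offset types.
#include <arrow/io/file.h>
#include <parquet/arrow/reader.h>

arrow::Status ReadWithLargeVariants(const std::string& path) {
  ARROW_ASSIGN_OR_RAISE(auto input, arrow::io::ReadableFile::Open(path));

  parquet::ArrowReaderProperties arrow_props;
  arrow_props.set_use_binary_large_variants(true);  // hypothetical setter name

  // The PR reuses FileReaderBuilder rather than adding a new OpenFile overload
  // (see commit 5fcf4e1).
  parquet::arrow::FileReaderBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Open(input));
  builder.properties(arrow_props);

  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(builder.Build(&reader));

  std::shared_ptr<arrow::Table> table;
  return reader->ReadTable(&table);
}
```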
2 changes: 1 addition & 1 deletion cpp/examples/parquet/parquet_arrow/reader_writer.cc
@@ -137,4 +137,4 @@ int main(int argc, char** argv) {
read_single_rowgroup();
read_single_column();
read_single_column_chunk();
}
}
1 change: 1 addition & 0 deletions cpp/src/arrow/array/builder_dict.h
@@ -724,6 +724,7 @@ using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
using StringDictionaryBuilder = DictionaryBuilder<StringType>;
using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
using StringDictionary32Builder = Dictionary32Builder<StringType>;
using LargeBinaryDictionary32Builder = Dictionary32Builder<LargeBinaryType>;

/// @}

2 changes: 2 additions & 0 deletions cpp/src/arrow/type.h
@@ -678,6 +678,8 @@ class ARROW_EXPORT BaseBinaryType : public DataType {

constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;

constexpr int64_t kLargeBinaryMemoryLimit = std::numeric_limits<int64_t>::max() - 1;

/// \addtogroup binary-datatypes
///
/// @{
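
The new `kLargeBinaryMemoryLimit` constant mirrors the existing `kBinaryMemoryLimit`: a binary array's capacity is bounded by the width of its offsets, so 32-bit offsets cap one array's value buffer just under 2 GiB, while the LARGE* variants use 64-bit offsets. A minimal stdlib sketch of that bound (the helper `FitsInSmallBinary` is illustrative, not an Arrow API):

```cpp
#include <cassert>
#include <cstdint>
#include <limits>

// Mirrors the constants added in cpp/src/arrow/type.h: usable byte capacity
// of one binary array is bounded by its offset type.
constexpr int64_t kBinaryMemoryLimit =
    std::numeric_limits<int32_t>::max() - 1;  // ~2 GiB with int32 offsets
constexpr int64_t kLargeBinaryMemoryLimit =
    std::numeric_limits<int64_t>::max() - 1;  // int64 offsets

// Hypothetical helper: can `total_bytes` of value data live in a single
// 32-bit-offset chunk?
constexpr bool FitsInSmallBinary(int64_t total_bytes) {
  return total_bytes <= kBinaryMemoryLimit;
}
```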
3 changes: 2 additions & 1 deletion cpp/src/parquet/arrow/reader.cc
@@ -219,6 +219,7 @@ class FileReaderImpl : public FileReader {
ctx->iterator_factory = SomeRowGroupsFactory(row_groups);
ctx->filter_leaves = true;
ctx->included_leaves = included_leaves;
ctx->use_binary_large_variants = reader_properties_.use_binary_large_variants();
return GetReader(manifest_.schema_fields[i], ctx, out);
}

@@ -462,7 +463,7 @@ class LeafReader : public ColumnReaderImpl {
input_(std::move(input)),
descr_(input_->descr()) {
record_reader_ = RecordReader::Make(
descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY);
descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY, false, ctx_->use_binary_large_variants);
NextRowGroup();
}

1 change: 1 addition & 0 deletions cpp/src/parquet/arrow/reader_internal.cc
@@ -85,6 +85,7 @@ using ::arrow::internal::SafeLeftShift;
using ::arrow::util::SafeLoadAs;

using parquet::internal::BinaryRecordReader;
using parquet::internal::LargeBinaryRecordReader;
using parquet::internal::DictionaryRecordReader;
using parquet::internal::RecordReader;
using parquet::schema::GroupNode;
1 change: 1 addition & 0 deletions cpp/src/parquet/arrow/reader_internal.h
@@ -109,6 +109,7 @@ struct ReaderContext {
FileColumnIteratorFactory iterator_factory;
bool filter_leaves;
std::shared_ptr<std::unordered_set<int>> included_leaves;
bool use_binary_large_variants = false;

bool IncludesLeaf(int leaf_index) const {
if (this->filter_leaves) {
2 changes: 1 addition & 1 deletion cpp/src/parquet/arrow/schema.cc
@@ -473,7 +473,7 @@ ::arrow::Result<std::shared_ptr<ArrowType>> GetTypeForNode(
SchemaTreeContext* ctx) {
ASSIGN_OR_RAISE(
std::shared_ptr<ArrowType> storage_type,
GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit()));
GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit(), ctx->properties.use_binary_large_variants()));
if (ctx->properties.read_dictionary(column_index) &&
IsDictionaryReadSupported(*storage_type)) {
return ::arrow::dictionary(::arrow::int32(), storage_type);
26 changes: 22 additions & 4 deletions cpp/src/parquet/arrow/schema_internal.cc
@@ -127,6 +127,23 @@ Result<std::shared_ptr<ArrowType>> FromByteArray(const LogicalType& logical_type
}
}

Result<std::shared_ptr<ArrowType>> FromLargeByteArray(const LogicalType& logical_type) {
switch (logical_type.type()) {
case LogicalType::Type::STRING:
return ::arrow::large_utf8();
case LogicalType::Type::DECIMAL:
return MakeArrowDecimal(logical_type);
case LogicalType::Type::NONE:
case LogicalType::Type::ENUM:
case LogicalType::Type::JSON:
case LogicalType::Type::BSON:
return ::arrow::large_binary();
default:
return Status::NotImplemented("Unhandled logical logical_type ",
logical_type.ToString(), " for binary array");
}
}

Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type,
int32_t physical_length) {
switch (logical_type.type()) {
@@ -181,7 +198,7 @@ Result<std::shared_ptr<ArrowType>> FromInt64(const LogicalType& logical_type) {

Result<std::shared_ptr<ArrowType>> GetArrowType(
Type::type physical_type, const LogicalType& logical_type, int type_length,
const ::arrow::TimeUnit::type int96_arrow_time_unit) {
const ::arrow::TimeUnit::type int96_arrow_time_unit, bool use_binary_large_variant) {
if (logical_type.is_invalid() || logical_type.is_null()) {
return ::arrow::null();
}
@@ -200,7 +217,7 @@ Result<std::shared_ptr<ArrowType>> GetArrowType(
case ParquetType::DOUBLE:
return ::arrow::float64();
case ParquetType::BYTE_ARRAY:
return FromByteArray(logical_type);
return use_binary_large_variant ? FromLargeByteArray(logical_type) : FromByteArray(logical_type);
case ParquetType::FIXED_LEN_BYTE_ARRAY:
return FromFLBA(logical_type, type_length);
default: {
@@ -213,9 +230,10 @@

Result<std::shared_ptr<ArrowType>> GetArrowType(
const schema::PrimitiveNode& primitive,
const ::arrow::TimeUnit::type int96_arrow_time_unit) {
const ::arrow::TimeUnit::type int96_arrow_time_unit,
bool use_binary_large_variant) {
return GetArrowType(primitive.physical_type(), *primitive.logical_type(),
primitive.type_length(), int96_arrow_time_unit);
primitive.type_length(), int96_arrow_time_unit, use_binary_large_variant);
}

} // namespace arrow
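
The new `FromLargeByteArray` maps Parquet logical types to the large Arrow variants exactly as `FromByteArray` does for the 32-bit-offset ones: STRING resolves to `large_utf8()`, while NONE, ENUM, JSON, and BSON resolve to `large_binary()`. A standalone stdlib distillation of that mapping (the enum and return strings are stand-ins for the real Parquet/Arrow types, not their API):

```cpp
#include <cassert>
#include <stdexcept>
#include <string>

// Stand-in for parquet::LogicalType::Type (illustrative subset only).
enum class LogicalKind { STRING, DECIMAL, NONE, ENUM, JSON, BSON, MAP };

// Mirrors the switch in FromLargeByteArray: the Arrow type a BYTE_ARRAY
// column resolves to when large variants are requested.
std::string LargeArrowTypeFor(LogicalKind kind) {
  switch (kind) {
    case LogicalKind::STRING:
      return "large_utf8";
    case LogicalKind::DECIMAL:
      return "decimal";  // the real code delegates to MakeArrowDecimal()
    case LogicalKind::NONE:
    case LogicalKind::ENUM:
    case LogicalKind::JSON:
    case LogicalKind::BSON:
      return "large_binary";
    default:
      throw std::runtime_error("Unhandled logical type for binary array");
  }
}
```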
11 changes: 8 additions & 3 deletions cpp/src/parquet/arrow/schema_internal.h
@@ -30,22 +30,27 @@ namespace arrow {
using ::arrow::Result;

Result<std::shared_ptr<::arrow::DataType>> FromByteArray(const LogicalType& logical_type);
Result<std::shared_ptr<::arrow::DataType>> FromLargeByteArray(const LogicalType& logical_type);

Result<std::shared_ptr<::arrow::DataType>> FromFLBA(const LogicalType& logical_type,
int32_t physical_length);
Result<std::shared_ptr<::arrow::DataType>> FromInt32(const LogicalType& logical_type);
Result<std::shared_ptr<::arrow::DataType>> FromInt64(const LogicalType& logical_type);

Result<std::shared_ptr<::arrow::DataType>> GetArrowType(Type::type physical_type,
const LogicalType& logical_type,
int type_length);
int type_length,
bool use_binary_large_variant = false);

Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
Type::type physical_type, const LogicalType& logical_type, int type_length,
::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO,
bool use_binary_large_variant = false);

Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
const schema::PrimitiveNode& primitive,
::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO,
bool use_binary_large_variant = false);

} // namespace arrow
} // namespace parquet
201 changes: 171 additions & 30 deletions cpp/src/parquet/column_reader.cc
@@ -1988,33 +1988,33 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
}

void DebugPrintState() override {
const int16_t* def_levels = this->def_levels();
const int16_t* rep_levels = this->rep_levels();
const int64_t total_levels_read = levels_position_;

const T* vals = reinterpret_cast<const T*>(this->values());

if (leaf_info_.def_level > 0) {
std::cout << "def levels: ";
for (int64_t i = 0; i < total_levels_read; ++i) {
std::cout << def_levels[i] << " ";
}
std::cout << std::endl;
}

if (leaf_info_.rep_level > 0) {
std::cout << "rep levels: ";
for (int64_t i = 0; i < total_levels_read; ++i) {
std::cout << rep_levels[i] << " ";
}
std::cout << std::endl;
}

std::cout << "values: ";
for (int64_t i = 0; i < this->values_written(); ++i) {
std::cout << vals[i] << " ";
}
std::cout << std::endl;
// const int16_t* def_levels = this->def_levels();
// const int16_t* rep_levels = this->rep_levels();
// const int64_t total_levels_read = levels_position_;
//
// const T* vals = reinterpret_cast<const T*>(this->values());
//
// if (leaf_info_.def_level > 0) {
// std::cout << "def levels: ";
// for (int64_t i = 0; i < total_levels_read; ++i) {
// std::cout << def_levels[i] << " ";
// }
// std::cout << std::endl;
// }
//
// if (leaf_info_.rep_level > 0) {
// std::cout << "rep levels: ";
// for (int64_t i = 0; i < total_levels_read; ++i) {
// std::cout << rep_levels[i] << " ";
// }
// std::cout << std::endl;
// }
//
// std::cout << "values: ";
// for (int64_t i = 0; i < this->values_written(); ++i) {
//// std::cout << vals[i] << " ";
// }
// std::cout << std::endl;
}

void ResetValues() {
@@ -2135,6 +2135,48 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
};

class LargeByteArrayChunkedRecordReader : public TypedRecordReader<LargeByteArrayType>,
virtual public LargeBinaryRecordReader {
public:
LargeByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
::arrow::MemoryPool* pool, bool read_dense_for_nullable)
: TypedRecordReader<LargeByteArrayType>(descr, leaf_info, pool,
read_dense_for_nullable) {
ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
accumulator_.builder = std::make_unique<::arrow::LargeBinaryBuilder>(pool);
}

::arrow::ArrayVector GetBuilderChunks() override {
::arrow::ArrayVector result = accumulator_.chunks;
if (result.size() == 0 || accumulator_.builder->length() > 0) {
std::shared_ptr<::arrow::Array> last_chunk;
PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk));
result.push_back(std::move(last_chunk));
}
accumulator_.chunks = {};
return result;
}

void ReadValuesDense(int64_t values_to_read) override {
int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull(
static_cast<int>(values_to_read), &accumulator_);
CheckNumberDecoded(num_decoded, values_to_read);
ResetValues();
}

void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
int64_t num_decoded = this->current_decoder_->DecodeArrow(
static_cast<int>(values_to_read), static_cast<int>(null_count),
valid_bits_->mutable_data(), values_written_, &accumulator_);
CheckNumberDecoded(num_decoded, values_to_read - null_count);
ResetValues();
}

private:
// Helper data structure for accumulating builder chunks
typename EncodingTraits<LargeByteArrayType>::Accumulator accumulator_;
};
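
The chunked record readers above accumulate decoded values into a builder and cut a new chunk whenever the current one would overflow the offset type; with 32-bit offsets that happens near 2 GiB per chunk, while the large variant can keep far more in one chunk. A toy stdlib simulation of that chunking policy (not the real Arrow builder API):

```cpp
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Toy stand-in for the chunking accumulator: values are appended to the
// current chunk until adding one would exceed `limit` bytes, at which point
// the chunk is finished and a new one started (what the chunked record
// reader does at the offset-type limit).
class ChunkingAccumulator {
 public:
  explicit ChunkingAccumulator(int64_t limit) : limit_(limit) {}

  void Append(const std::string& value) {
    if (current_bytes_ + static_cast<int64_t>(value.size()) > limit_) {
      FinishChunk();
    }
    current_.push_back(value);
    current_bytes_ += static_cast<int64_t>(value.size());
  }

  // Finalize: flush any pending values, then hand back all chunks.
  std::vector<std::vector<std::string>> Finish() {
    if (!current_.empty()) FinishChunk();
    return std::move(chunks_);
  }

 private:
  void FinishChunk() {
    chunks_.push_back(std::move(current_));
    current_.clear();
    current_bytes_ = 0;
  }

  int64_t limit_;
  int64_t current_bytes_ = 0;
  std::vector<std::string> current_;
  std::vector<std::vector<std::string>> chunks_;
};
```

With a 10-byte limit, four 4-byte values split into two chunks of two values each.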

class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
virtual public DictionaryRecordReader {
public:
@@ -2217,6 +2259,88 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
std::vector<std::shared_ptr<::arrow::Array>> result_chunks_;
};

class LargeByteArrayDictionaryRecordReader : public TypedRecordReader<LargeByteArrayType>,
virtual public DictionaryRecordReader {
public:
LargeByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
::arrow::MemoryPool* pool, bool read_dense_for_nullable)
: TypedRecordReader<LargeByteArrayType>(descr, leaf_info, pool, read_dense_for_nullable),
builder_(pool) {
this->read_dictionary_ = true;
}

std::shared_ptr<::arrow::ChunkedArray> GetResult() override {
FlushBuilder();
std::vector<std::shared_ptr<::arrow::Array>> result;
std::swap(result, result_chunks_);
return std::make_shared<::arrow::ChunkedArray>(std::move(result), builder_.type());
}

void FlushBuilder() {
if (builder_.length() > 0) {
std::shared_ptr<::arrow::Array> chunk;
PARQUET_THROW_NOT_OK(builder_.Finish(&chunk));
result_chunks_.emplace_back(std::move(chunk));

// Also clears the dictionary memo table
builder_.Reset();
}
}

void MaybeWriteNewDictionary() {
if (this->new_dictionary_) {
/// If there is a new dictionary, we may need to flush the builder, then
/// insert the new dictionary values
FlushBuilder();
builder_.ResetFull();
auto decoder = dynamic_cast<LargeBinaryDictDecoder*>(this->current_decoder_);
decoder->InsertDictionary(&builder_);
this->new_dictionary_ = false;
}
}

void ReadValuesDense(int64_t values_to_read) override {
int64_t num_decoded = 0;
if (current_encoding_ == Encoding::RLE_DICTIONARY) {
MaybeWriteNewDictionary();
auto decoder = dynamic_cast<LargeBinaryDictDecoder*>(this->current_decoder_);
num_decoded = decoder->DecodeIndices(static_cast<int>(values_to_read), &builder_);
} else {
num_decoded = this->current_decoder_->DecodeArrowNonNull(
static_cast<int>(values_to_read), &builder_);

/// Flush values since they have been copied into the builder
ResetValues();
}
CheckNumberDecoded(num_decoded, values_to_read);
}

void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
int64_t num_decoded = 0;
if (current_encoding_ == Encoding::RLE_DICTIONARY) {
MaybeWriteNewDictionary();
auto decoder = dynamic_cast<LargeBinaryDictDecoder*>(this->current_decoder_);
num_decoded = decoder->DecodeIndicesSpaced(
static_cast<int>(values_to_read), static_cast<int>(null_count),
valid_bits_->mutable_data(), values_written_, &builder_);
} else {
num_decoded = this->current_decoder_->DecodeArrow(
static_cast<int>(values_to_read), static_cast<int>(null_count),
valid_bits_->mutable_data(), values_written_, &builder_);

/// Flush values since they have been copied into the builder
ResetValues();
}
ARROW_DCHECK_EQ(num_decoded, values_to_read - null_count);
}

private:
using LargeBinaryDictDecoder = DictDecoder<LargeByteArrayType>;

::arrow::LargeBinaryDictionary32Builder builder_;
std::vector<std::shared_ptr<::arrow::Array>> result_chunks_;
};
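
In the dictionary path above, each row group carries a dictionary page and the data pages hold RLE-encoded indices into it; `MaybeWriteNewDictionary` flushes the builder before installing a new dictionary, and `DecodeIndices` resolves indices against it. A toy stdlib sketch of that index resolution step (not the real decoder API):

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

// Toy dictionary decode: resolve index-encoded values against a dictionary
// page, as the RLE_DICTIONARY branch of ReadValuesDense does via
// DictDecoder::DecodeIndices.
std::vector<std::string> DecodeIndices(const std::vector<std::string>& dictionary,
                                       const std::vector<int32_t>& indices) {
  std::vector<std::string> out;
  out.reserve(indices.size());
  for (int32_t idx : indices) {
    if (idx < 0 || static_cast<size_t>(idx) >= dictionary.size()) {
      throw std::out_of_range("dictionary index out of range");
    }
    out.push_back(dictionary[idx]);
  }
  return out;
}
```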

// TODO(wesm): Implement these to some satisfaction
template <>
void TypedRecordReader<Int96Type>::DebugPrintState() {}
@@ -2241,12 +2365,27 @@ std::shared_ptr<RecordReader> MakeByteArrayRecordReader(const ColumnDescriptor*
}
}

std::shared_ptr<RecordReader> MakeLargeByteArrayRecordReader(const ColumnDescriptor* descr,
LevelInfo leaf_info,
::arrow::MemoryPool* pool,
bool read_dictionary,
bool read_dense_for_nullable) {
if (read_dictionary) {
return std::make_shared<LargeByteArrayDictionaryRecordReader>(descr, leaf_info, pool,
read_dense_for_nullable);
[Review comment from a project member: "You may use clang-format to format all changed files."]
} else {
return std::make_shared<LargeByteArrayChunkedRecordReader>(
descr, leaf_info, pool, read_dense_for_nullable);
}
}

} // namespace

std::shared_ptr<RecordReader> RecordReader::Make(const ColumnDescriptor* descr,
LevelInfo leaf_info, MemoryPool* pool,
bool read_dictionary,
bool read_dense_for_nullable) {
bool read_dense_for_nullable,
bool use_binary_string_large_variants) {
switch (descr->physical_type()) {
case Type::BOOLEAN:
return std::make_shared<TypedRecordReader<BooleanType>>(descr, leaf_info, pool,
@@ -2267,8 +2406,10 @@ std::shared_ptr<RecordReader> RecordReader::Make(const ColumnDescriptor* descr,
return std::make_shared<TypedRecordReader<DoubleType>>(descr, leaf_info, pool,
read_dense_for_nullable);
case Type::BYTE_ARRAY: {
return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary,
read_dense_for_nullable);
return use_binary_string_large_variants ? MakeLargeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary,
read_dense_for_nullable)
: MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary,
read_dense_for_nullable);
}
case Type::FIXED_LEN_BYTE_ARRAY:
return std::make_shared<FLBARecordReader>(descr, leaf_info, pool,