From bb3fe3f3eb2bb81de0eaae1240da256cefc04051 Mon Sep 17 00:00:00 2001
From: Hyunseok Seo
Date: Mon, 16 Oct 2023 17:57:09 +0900
Subject: [PATCH] GH-38074: [C++] Fix Offset Size Calculation for Slicing Large
 String and Binary Types in Hash Join (#38147)

### Rationale for this change

We found that incorrect inner join results during hash join operations were caused by how Large String and Large Binary types were handled: the `Slice` function did not calculate their offset sizes correctly. To fix this, I changed the `Slice` function to calculate sizes based on the actual offset width of the data type (large string and large binary use 64-bit offsets rather than 32-bit).

* Issue raised: #37729

### What changes are included in this PR?

* The `Slice` function has been updated to correctly calculate the offset for Large String and Large Binary types, and assertion statements have been added to improve maintainability.
* Unit tests (`TEST(KeyColumnArray, SliceBinaryTest)`) for the `Slice` function have been added.
* The random hash join tests (`TEST(HashJoin, Random)`) were modified so that Large String can be generated as a key column type.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Acero might not have a large user base, as it is an experimental feature, but I deemed the issue of incorrect join results critical and have addressed the bug.

* Closes: #38074

Authored-by: Hyunseok Seo
Signed-off-by: Antoine Pitrou
---
 cpp/src/arrow/acero/hash_join_node_test.cc | 27 ++++++++--
 cpp/src/arrow/compute/light_array.cc       | 12 +++--
 cpp/src/arrow/compute/light_array.h        | 25 +++++++++
 cpp/src/arrow/compute/light_array_test.cc  | 60 ++++++++++++++++++++++
 4 files changed, 116 insertions(+), 8 deletions(-)
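Before the diffs, here is a minimal, self-contained sketch of the root cause (illustrative buffers and names only, not Arrow code): `binary()`/`utf8()` store 32-bit offsets while `large_binary()`/`large_utf8()` store 64-bit offsets, so slicing the offsets buffer must stride by the per-type offset width rather than a hardcoded `sizeof(uint32_t)`.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // Offsets buffer of a hypothetical large_binary() column holding
  // ["Hello", "World", "Slice"]; large types store 64-bit offsets.
  uint64_t large_offsets[] = {0, 5, 10, 15};
  const uint8_t* base = reinterpret_cast<const uint8_t*>(large_offsets);

  int64_t slice_offset = 1;  // slice away the first element

  // Old behavior: the byte stride was hardcoded to sizeof(uint32_t) for every
  // var-length type, which lands in the middle of a 64-bit offset entry.
  uint64_t wrong;
  std::memcpy(&wrong, base + slice_offset * sizeof(uint32_t), sizeof(wrong));

  // Fixed behavior: stride by the offset width of the column type
  // (4 bytes for binary/string, 8 bytes for the large variants).
  uint64_t right;
  std::memcpy(&right, base + slice_offset * sizeof(uint64_t), sizeof(right));

  std::printf("wrong first offset: %llu\n", static_cast<unsigned long long>(wrong));
  std::printf("right first offset: %llu\n", static_cast<unsigned long long>(right));
  return 0;
}
```

On a little-endian machine the "wrong" read yields 21474836480 instead of the expected 5, which is exactly the kind of garbage offset that produced incorrect join results.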
diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc
index 960d3838fb8fd..58551f4eca00a 100644
--- a/cpp/src/arrow/acero/hash_join_node_test.cc
+++ b/cpp/src/arrow/acero/hash_join_node_test.cc
@@ -253,7 +253,8 @@ struct RandomDataTypeConstraints {
   int max_string_length;
 
   void Default() {
-    data_type_enabled_mask = kInt1 | kInt2 | kInt4 | kInt8 | kBool | kBinary | kString;
+    data_type_enabled_mask =
+        kInt1 | kInt2 | kInt4 | kInt8 | kBool | kBinary | kString | kLargeString;
     min_null_probability = 0.0;
     max_null_probability = 0.2;
     min_binary_length = 1;
@@ -289,6 +290,7 @@ struct RandomDataTypeConstraints {
   static constexpr int64_t kBool = 16;
   static constexpr int64_t kBinary = 32;
   static constexpr int64_t kString = 64;
+  static constexpr int64_t kLargeString = 128;
 };
 
 struct RandomDataType {
@@ -297,6 +299,7 @@ struct RandomDataType {
   int fixed_length;
   int min_string_length;
   int max_string_length;
+  bool is_large_string;
 
   static RandomDataType Random(Random64Bit& rng,
                                const RandomDataTypeConstraints& constraints) {
@@ -312,6 +315,15 @@ struct RandomDataType {
     } else {
       result.is_fixed_length = true;
     }
+
+    if (!result.is_fixed_length &&
+        (constraints.data_type_enabled_mask & constraints.kLargeString) != 0) {
+      // When selecting the string type, there's a 50% chance of choosing a large string.
+      result.is_large_string = ((rng.next() % 2) == 0);
+    } else {
+      result.is_large_string = false;
+    }
+
     if (constraints.max_null_probability > 0.0) {
       // 25% chance of no nulls
       // Uniform distribution of null probability from min to max
@@ -405,9 +417,16 @@ std::vector<std::shared_ptr<Array>> GenRandomRecords(
         break;
       }
     } else {
-      result.push_back(rag.String(num_rows, data_types[i].min_string_length,
-                                  data_types[i].max_string_length,
-                                  data_types[i].null_probability));
+      if (data_types[i].is_large_string) {
+        // Generate LargeString if is_large_string flag is true
+        result.push_back(rag.LargeString(num_rows, data_types[i].min_string_length,
+                                         data_types[i].max_string_length,
+                                         data_types[i].null_probability));
+      } else {
+        result.push_back(rag.String(num_rows, data_types[i].min_string_length,
+                                    data_types[i].max_string_length,
+                                    data_types[i].null_probability));
+      }
     }
   }
   return result;
diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc
index 37d4421fd79f7..4e8b2b2d7cc3a 100644
--- a/cpp/src/arrow/compute/light_array.cc
+++ b/cpp/src/arrow/compute/light_array.cc
@@ -80,8 +80,7 @@ KeyColumnArray KeyColumnArray::Slice(int64_t offset, int64_t length) const {
   KeyColumnArray sliced;
   sliced.metadata_ = metadata_;
   sliced.length_ = length;
-  uint32_t fixed_size =
-      !metadata_.is_fixed_length ? sizeof(uint32_t) : metadata_.fixed_length;
+  uint32_t fixed_size = metadata_.fixed_length;
 
   sliced.buffers_[0] =
       buffers_[0] ? buffers_[0] + (bit_offset_[0] + offset) / 8 : nullptr;
@@ -89,18 +88,23 @@ KeyColumnArray KeyColumnArray::Slice(int64_t offset, int64_t length) const {
       mutable_buffers_[0] ? mutable_buffers_[0] + (bit_offset_[0] + offset) / 8 : nullptr;
   sliced.bit_offset_[0] = (bit_offset_[0] + offset) % 8;
 
-  if (fixed_size == 0 && !metadata_.is_null_type) {
+  if (metadata_.fixed_length == 0 && !metadata_.is_null_type) {
+    ARROW_DCHECK(is_bool_type()) << "Expected BOOL type but got a different type.";
     sliced.buffers_[1] =
         buffers_[1] ? buffers_[1] + (bit_offset_[1] + offset) / 8 : nullptr;
     sliced.mutable_buffers_[1] = mutable_buffers_[1]
                                      ? mutable_buffers_[1] + (bit_offset_[1] + offset) / 8
                                      : nullptr;
     sliced.bit_offset_[1] = (bit_offset_[1] + offset) % 8;
-  } else {
+  } else if (metadata_.fixed_length > 0) {
+    ARROW_DCHECK(is_binary_type() || is_large_binary_type() || is_fixed_width_types())
+        << "Expected (LARGE) BINARY or FIXED WIDTH type but got a different type.";
     sliced.buffers_[1] = buffers_[1] ? buffers_[1] + offset * fixed_size : nullptr;
     sliced.mutable_buffers_[1] =
         mutable_buffers_[1] ? mutable_buffers_[1] + offset * fixed_size : nullptr;
     sliced.bit_offset_[1] = 0;
+  } else {
+    ARROW_DCHECK(is_null_type()) << "Expected Null type but got a different type.";
   }
 
   sliced.buffers_[2] = buffers_[2];
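A note on the three branches above: after this change, `metadata_.fixed_length` alone distinguishes the cases, since it stores the element width for fixed-width columns and the offset width (4 or 8 bytes) for var-length columns, with 0 reserved for boolean bit vectors and null-only columns. A standalone sketch of that classification, assuming a simplified stand-in for `KeyColumnMetadata` (only the field names below mirror the real struct; everything else is illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Simplified stand-in for arrow::compute::KeyColumnMetadata.
struct Metadata {
  bool is_fixed_length;
  bool is_null_type;
  uint32_t fixed_length;  // element width for fixed-width columns; offset
                          // width (4 or 8) for var-length columns; 0 for
                          // booleans and null-only columns
};

enum class SliceKind { BitVector, ByteStride, NullOnly };

// Mirrors the three branches asserted in the patched Slice().
SliceKind Classify(const Metadata& m) {
  if (m.fixed_length == 0 && !m.is_null_type) return SliceKind::BitVector;
  if (m.fixed_length > 0) return SliceKind::ByteStride;
  return SliceKind::NullOnly;
}

int main() {
  assert(Classify({true, false, 0}) == SliceKind::BitVector);    // boolean
  assert(Classify({true, false, 4}) == SliceKind::ByteStride);   // int32
  assert(Classify({false, false, 4}) == SliceKind::ByteStride);  // binary
  assert(Classify({false, false, 8}) == SliceKind::ByteStride);  // large binary
  assert(Classify({true, true, 0}) == SliceKind::NullOnly);      // null
  return 0;
}
```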
diff --git a/cpp/src/arrow/compute/light_array.h b/cpp/src/arrow/compute/light_array.h
index 3dd328c22eedf..87f6b6c76a12c 100644
--- a/cpp/src/arrow/compute/light_array.h
+++ b/cpp/src/arrow/compute/light_array.h
@@ -183,6 +183,31 @@ class ARROW_EXPORT KeyColumnArray {
   // Starting bit offset within the first byte (between 0 and 7)
   // to be used when accessing buffers that store bit vectors.
   int bit_offset_[kMaxBuffers - 1];
+
+  bool is_bool_type() const {
+    return metadata_.is_fixed_length && metadata_.fixed_length == 0 &&
+           !metadata_.is_null_type;
+  }
+
+  bool is_fixed_width_types() const {
+    return metadata_.is_fixed_length && metadata_.fixed_length != 0 &&
+           !metadata_.is_null_type;
+  }
+
+  bool is_binary_type() const {
+    return !metadata_.is_fixed_length && metadata_.fixed_length == sizeof(uint32_t) &&
+           !metadata_.is_null_type;
+  }
+
+  bool is_large_binary_type() const {
+    return !metadata_.is_fixed_length && metadata_.fixed_length == sizeof(uint64_t) &&
+           !metadata_.is_null_type;
+  }
+
+  bool is_null_type() const {
+    return metadata_.is_fixed_length && metadata_.fixed_length == 0 &&
+           metadata_.is_null_type;
+  }
 };
 
 /// \brief Create KeyColumnMetadata from a DataType
diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc
index 71d228bf3f614..4e33f7b578ea8 100644
--- a/cpp/src/arrow/compute/light_array_test.cc
+++ b/cpp/src/arrow/compute/light_array_test.cc
@@ -226,6 +226,66 @@ TEST(KeyColumnArray, SliceBool) {
   }
 }
 
+struct SliceTestCase {
+  int offset;
+  int length;
+  std::vector<std::string> expected;
+};
+
+template <typename OffsetType>
+void GenericTestSlice(const std::shared_ptr<DataType>& type, const char* json_data,
+                      const std::vector<SliceTestCase>& testCases) {
+  auto array = ArrayFromJSON(type, json_data);
+  KeyColumnArray kc_array =
+      ColumnArrayFromArrayData(array->data(), 0, array->length()).ValueOrDie();
+
+  for (const auto& testCase : testCases) {
+    ARROW_SCOPED_TRACE("Offset: ", testCase.offset, " Length: ", testCase.length);
+    KeyColumnArray sliced = kc_array.Slice(testCase.offset, testCase.length);
+
+    // Extract binary data from the sliced KeyColumnArray
+    std::vector<std::string> sliced_data;
+    const auto* offset_data = reinterpret_cast<const OffsetType*>(sliced.data(1));
+    const auto* string_data = reinterpret_cast<const char*>(sliced.data(2));
+
+    for (auto i = 0; i < testCase.length; ++i) {
+      auto start = offset_data[i];
+      auto end = offset_data[i + 1];
+      sliced_data.push_back(std::string(string_data + start, string_data + end));
+    }
+
+    // Compare the sliced values to the expected string
+    ASSERT_EQ(testCase.expected, sliced_data);
+  }
+}
+
+TEST(KeyColumnArray, SliceBinaryTest) {
+  const char* json_test_strings = R"(["Hello", "World", "Slice", "Binary", "Test"])";
+  std::vector<SliceTestCase> testCases = {
+      {0, 1, {"Hello"}},
+      {1, 1, {"World"}},
+      {2, 1, {"Slice"}},
+      {3, 1, {"Binary"}},
+      {4, 1, {"Test"}},
+      {0, 2, {"Hello", "World"}},
+      {1, 2, {"World", "Slice"}},
+      {2, 2, {"Slice", "Binary"}},
+      {3, 2, {"Binary", "Test"}},
+      {0, 3, {"Hello", "World", "Slice"}},
+      {1, 3, {"World", "Slice", "Binary"}},
+      {2, 3, {"Slice", "Binary", "Test"}},
+      {0, 4, {"Hello", "World", "Slice", "Binary"}},
+      {1, 4, {"World", "Slice", "Binary", "Test"}},
+      {0, 5, {"Hello", "World", "Slice", "Binary", "Test"}},
+  };
+
+  // Run tests with binary type
+  GenericTestSlice<uint32_t>(binary(), json_test_strings, testCases);
+
+  // Run tests with large binary type
+  GenericTestSlice<uint64_t>(large_binary(), json_test_strings, testCases);
+}
+
 TEST(ResizableArrayData, Basic) {
   std::unique_ptr<MemoryPool> pool = MemoryPool::CreateDefault();
   for (const auto& type : kSampleFixedDataTypes) {
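For reference, a standalone model of the read-back loop used by `GenericTestSlice` above (the helper and names below are illustrative, not part of the Arrow test suite): given a typed offsets buffer and a character buffer, materializing `length` values starting at `offset` requires reading offsets at the type's true width, `uint32_t` for `binary()` and `uint64_t` for `large_binary()`.

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Read `length` values starting at element `offset` from a var-length layout:
// offsets[i]..offsets[i + 1] delimit value i within the character buffer.
template <typename OffsetType>
std::vector<std::string> ReadValues(const OffsetType* offsets, const char* chars,
                                    int64_t offset, int64_t length) {
  std::vector<std::string> out;
  for (int64_t i = 0; i < length; ++i) {
    OffsetType start = offsets[offset + i];
    OffsetType end = offsets[offset + i + 1];
    out.emplace_back(chars + start, chars + end);  // iterator-range constructor
  }
  return out;
}

int main() {
  const char chars[] = "HelloWorldSlice";
  uint64_t offsets[] = {0, 5, 10, 15};  // 64-bit offsets, as in large_binary()
  auto vals = ReadValues<uint64_t>(offsets, chars, 1, 2);
  assert((vals == std::vector<std::string>{"World", "Slice"}));
  return 0;
}
```

Instantiating this with the wrong `OffsetType` reproduces the symptom the test guards against: the offsets are reinterpreted at the wrong width and the extracted strings no longer match the expected values.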