From bb3fe3f3eb2bb81de0eaae1240da256cefc04051 Mon Sep 17 00:00:00 2001
From: Hyunseok Seo
Date: Mon, 16 Oct 2023 17:57:09 +0900
Subject: [PATCH] GH-38074: [C++] Fix Offset Size Calculation for Slicing Large
 String and Binary Types in Hash Join (#38147)

### Rationale for this change

We found that incorrect inner join results during hash join operations were caused by how Large String and Large Binary types were handled: the `Slice` function did not calculate their offset sizes correctly. To fix this, I changed the `Slice` function to calculate sizes based on the actual offset width of the data type (large string and large binary use 64-bit offsets rather than 32-bit).

* Issue raised: #37729

### What changes are included in this PR?

* The `Slice` function has been updated to correctly calculate the offset for Large String and Large Binary types, and assertion statements have been added to improve maintainability.
* Unit tests (`TEST(KeyColumnArray, SliceBinaryTest)`) for the `Slice` function have been added.
* The random hash join tests (`TEST(HashJoin, Random)`) were modified so that Large String can be generated as a key column type.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Acero might not have a large user base, as it is an experimental feature, but I deemed the issue of incorrect join results critical and have addressed the bug.

* Closes: #38074

Authored-by: Hyunseok Seo
Signed-off-by: Antoine Pitrou
---
 cpp/src/arrow/acero/hash_join_node_test.cc | 27 ++++++++--
 cpp/src/arrow/compute/light_array.cc       | 12 +++--
 cpp/src/arrow/compute/light_array.h        | 25 +++++++++
 cpp/src/arrow/compute/light_array_test.cc  | 60 ++++++++++++++++++++++
 4 files changed, 116 insertions(+), 8 deletions(-)
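Before the diffs, here is a minimal, self-contained sketch of the root cause (illustrative buffers and names only, not Arrow code): `binary()`/`utf8()` store 32-bit offsets while `large_binary()`/`large_utf8()` store 64-bit offsets, so slicing the offsets buffer must stride by the per-type offset width rather than a hardcoded `sizeof(uint32_t)`.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // Offsets buffer of a hypothetical large_binary() column holding
  // ["Hello", "World", "Slice"]; large types store 64-bit offsets.
  uint64_t large_offsets[] = {0, 5, 10, 15};
  const uint8_t* base = reinterpret_cast<const uint8_t*>(large_offsets);

  int64_t slice_offset = 1;  // slice away the first element

  // Old behavior: the byte stride was hardcoded to sizeof(uint32_t) for every
  // var-length type, which lands in the middle of a 64-bit offset entry.
  uint64_t wrong;
  std::memcpy(&wrong, base + slice_offset * sizeof(uint32_t), sizeof(wrong));

  // Fixed behavior: stride by the offset width of the column type
  // (4 bytes for binary/string, 8 bytes for the large variants).
  uint64_t right;
  std::memcpy(&right, base + slice_offset * sizeof(uint64_t), sizeof(right));

  std::printf("wrong first offset: %llu\n", static_cast<unsigned long long>(wrong));
  std::printf("right first offset: %llu\n", static_cast<unsigned long long>(right));
  return 0;
}
```

On a little-endian machine the "wrong" read yields 21474836480 instead of the expected 5, which is exactly the kind of garbage offset that produced incorrect join results.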
diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc
index 960d3838fb8fd..58551f4eca00a 100644
--- a/cpp/src/arrow/acero/hash_join_node_test.cc
+++ b/cpp/src/arrow/acero/hash_join_node_test.cc
@@ -253,7 +253,8 @@ struct RandomDataTypeConstraints {
   int max_string_length;
 
   void Default() {
-    data_type_enabled_mask = kInt1 | kInt2 | kInt4 | kInt8 | kBool | kBinary | kString;
+    data_type_enabled_mask =
+        kInt1 | kInt2 | kInt4 | kInt8 | kBool | kBinary | kString | kLargeString;
     min_null_probability = 0.0;
     max_null_probability = 0.2;
     min_binary_length = 1;
@@ -289,6 +290,7 @@ struct RandomDataTypeConstraints {
   static constexpr int64_t kBool = 16;
   static constexpr int64_t kBinary = 32;
   static constexpr int64_t kString = 64;
+  static constexpr int64_t kLargeString = 128;
 };
 
 struct RandomDataType {
@@ -297,6 +299,7 @@ struct RandomDataType {
   int fixed_length;
   int min_string_length;
   int max_string_length;
+  bool is_large_string;
 
   static RandomDataType Random(Random64Bit& rng,
                                const RandomDataTypeConstraints& constraints) {
@@ -312,6 +315,15 @@ struct RandomDataType {
     } else {
       result.is_fixed_length = true;
     }
+
+    if (!result.is_fixed_length &&
+        (constraints.data_type_enabled_mask & constraints.kLargeString) != 0) {
+      // When selecting the string type, there's a 50% chance of choosing a large string.
+      result.is_large_string = ((rng.next() % 2) == 0);
+    } else {
+      result.is_large_string = false;
+    }
+
     if (constraints.max_null_probability > 0.0) {
       // 25% chance of no nulls
       // Uniform distribution of null probability from min to max
@@ -405,9 +417,16 @@ std::vector<std::shared_ptr<Array>> GenRandomRecords(
         break;
       }
     } else {
-      result.push_back(rag.String(num_rows, data_types[i].min_string_length,
-                                  data_types[i].max_string_length,
-                                  data_types[i].null_probability));
+      if (data_types[i].is_large_string) {
+        // Generate LargeString if is_large_string flag is true
+        result.push_back(rag.LargeString(num_rows, data_types[i].min_string_length,
+                                         data_types[i].max_string_length,
+                                         data_types[i].null_probability));
+      } else {
+        result.push_back(rag.String(num_rows, data_types[i].min_string_length,
+                                    data_types[i].max_string_length,
+                                    data_types[i].null_probability));
+      }
     }
   }
   return result;
diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc
index 37d4421fd79f7..4e8b2b2d7cc3a 100644
--- a/cpp/src/arrow/compute/light_array.cc
+++ b/cpp/src/arrow/compute/light_array.cc
@@ -80,8 +80,7 @@ KeyColumnArray KeyColumnArray::Slice(int64_t offset, int64_t length) const {
   KeyColumnArray sliced;
   sliced.metadata_ = metadata_;
   sliced.length_ = length;
-  uint32_t fixed_size =
-      !metadata_.is_fixed_length ? sizeof(uint32_t) : metadata_.fixed_length;
+  uint32_t fixed_size = metadata_.fixed_length;
 
   sliced.buffers_[0] =
       buffers_[0] ? buffers_[0] + (bit_offset_[0] + offset) / 8 : nullptr;
@@ -89,18 +88,23 @@ KeyColumnArray KeyColumnArray::Slice(int64_t offset, int64_t length) const {
       mutable_buffers_[0] ? mutable_buffers_[0] + (bit_offset_[0] + offset) / 8 : nullptr;
   sliced.bit_offset_[0] = (bit_offset_[0] + offset) % 8;
 
-  if (fixed_size == 0 && !metadata_.is_null_type) {
+  if (metadata_.fixed_length == 0 && !metadata_.is_null_type) {
+    ARROW_DCHECK(is_bool_type()) << "Expected BOOL type but got a different type.";
     sliced.buffers_[1] =
         buffers_[1] ? buffers_[1] + (bit_offset_[1] + offset) / 8 : nullptr;
     sliced.mutable_buffers_[1] = mutable_buffers_[1]
                                      ? mutable_buffers_[1] + (bit_offset_[1] + offset) / 8
                                      : nullptr;
     sliced.bit_offset_[1] = (bit_offset_[1] + offset) % 8;
-  } else {
+  } else if (metadata_.fixed_length > 0) {
+    ARROW_DCHECK(is_binary_type() || is_large_binary_type() || is_fixed_width_types())
+        << "Expected (LARGE) BINARY or FIXED WIDTH type but got a different type.";
     sliced.buffers_[1] = buffers_[1] ? buffers_[1] + offset * fixed_size : nullptr;
     sliced.mutable_buffers_[1] =
         mutable_buffers_[1] ? mutable_buffers_[1] + offset * fixed_size : nullptr;
     sliced.bit_offset_[1] = 0;
+  } else {
+    ARROW_DCHECK(is_null_type()) << "Expected Null type but got a different type.";
   }
 
   sliced.buffers_[2] = buffers_[2];
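A note on the three branches above: after this change, `metadata_.fixed_length` alone distinguishes the cases, since it stores the element width for fixed-width columns and the offset width (4 or 8 bytes) for var-length columns, with 0 reserved for boolean bit vectors and null-only columns. A standalone sketch of that classification, assuming a simplified stand-in for `KeyColumnMetadata` (only the field names below mirror the real struct; everything else is illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Simplified stand-in for arrow::compute::KeyColumnMetadata.
struct Metadata {
  bool is_fixed_length;
  bool is_null_type;
  uint32_t fixed_length;  // element width for fixed-width columns; offset
                          // width (4 or 8) for var-length columns; 0 for
                          // booleans and null-only columns
};

enum class SliceKind { BitVector, ByteStride, NullOnly };

// Mirrors the three branches asserted in the patched Slice().
SliceKind Classify(const Metadata& m) {
  if (m.fixed_length == 0 && !m.is_null_type) return SliceKind::BitVector;
  if (m.fixed_length > 0) return SliceKind::ByteStride;
  return SliceKind::NullOnly;
}

int main() {
  assert(Classify({true, false, 0}) == SliceKind::BitVector);    // boolean
  assert(Classify({true, false, 4}) == SliceKind::ByteStride);   // int32
  assert(Classify({false, false, 4}) == SliceKind::ByteStride);  // binary
  assert(Classify({false, false, 8}) == SliceKind::ByteStride);  // large binary
  assert(Classify({true, true, 0}) == SliceKind::NullOnly);      // null
  return 0;
}
```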
diff --git a/cpp/src/arrow/compute/light_array.h b/cpp/src/arrow/compute/light_array.h
index 3dd328c22eedf..87f6b6c76a12c 100644
--- a/cpp/src/arrow/compute/light_array.h
+++ b/cpp/src/arrow/compute/light_array.h
@@ -183,6 +183,31 @@ class ARROW_EXPORT KeyColumnArray {
   // Starting bit offset within the first byte (between 0 and 7)
   // to be used when accessing buffers that store bit vectors.
   int bit_offset_[kMaxBuffers - 1];
+
+  bool is_bool_type() const {
+    return metadata_.is_fixed_length && metadata_.fixed_length == 0 &&
+           !metadata_.is_null_type;
+  }
+
+  bool is_fixed_width_types() const {
+    return metadata_.is_fixed_length && metadata_.fixed_length != 0 &&
+           !metadata_.is_null_type;
+  }
+
+  bool is_binary_type() const {
+    return !metadata_.is_fixed_length && metadata_.fixed_length == sizeof(uint32_t) &&
+           !metadata_.is_null_type;
+  }
+
+  bool is_large_binary_type() const {
+    return !metadata_.is_fixed_length && metadata_.fixed_length == sizeof(uint64_t) &&
+           !metadata_.is_null_type;
+  }
+
+  bool is_null_type() const {
+    return metadata_.is_fixed_length && metadata_.fixed_length == 0 &&
+           metadata_.is_null_type;
+  }
 };
 
 /// \brief Create KeyColumnMetadata from a DataType
diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc
index 71d228bf3f614..4e33f7b578ea8 100644
--- a/cpp/src/arrow/compute/light_array_test.cc
+++ b/cpp/src/arrow/compute/light_array_test.cc
@@ -226,6 +226,66 @@ TEST(KeyColumnArray, SliceBool) {
   }
 }
 
+struct SliceTestCase {
+  int offset;
+  int length;
+  std::vector<std::string> expected;
+};
+
+template <typename OffsetType>
+void GenericTestSlice(const std::shared_ptr<DataType>& type, const char* json_data,
+                      const std::vector<SliceTestCase>& testCases) {
+  auto array = ArrayFromJSON(type, json_data);
+  KeyColumnArray kc_array =
+      ColumnArrayFromArrayData(array->data(), 0, array->length()).ValueOrDie();
+
+  for (const auto& testCase : testCases) {
+    ARROW_SCOPED_TRACE("Offset: ", testCase.offset, " Length: ", testCase.length);
+    KeyColumnArray sliced = kc_array.Slice(testCase.offset, testCase.length);
+
+    // Extract binary data from the sliced KeyColumnArray
+    std::vector<std::string> sliced_data;
+    const auto* offset_data = reinterpret_cast<const OffsetType*>(sliced.data(1));
+    const auto* string_data = reinterpret_cast<const char*>(sliced.data(2));
+
+    for (auto i = 0; i < testCase.length; ++i) {
+      auto start = offset_data[i];
+      auto end = offset_data[i + 1];
+      sliced_data.push_back(std::string(string_data + start, string_data + end));
+    }
+
+    // Compare the sliced values to the expected string
+    ASSERT_EQ(testCase.expected, sliced_data);
+  }
+}
+
+TEST(KeyColumnArray, SliceBinaryTest) {
+  const char* json_test_strings = R"(["Hello", "World", "Slice", "Binary", "Test"])";
+  std::vector<SliceTestCase> testCases = {
+      {0, 1, {"Hello"}},
+      {1, 1, {"World"}},
+      {2, 1, {"Slice"}},
+      {3, 1, {"Binary"}},
+      {4, 1, {"Test"}},
+      {0, 2, {"Hello", "World"}},
+      {1, 2, {"World", "Slice"}},
+      {2, 2, {"Slice", "Binary"}},
+      {3, 2, {"Binary", "Test"}},
+      {0, 3, {"Hello", "World", "Slice"}},
+      {1, 3, {"World", "Slice", "Binary"}},
+      {2, 3, {"Slice", "Binary", "Test"}},
+      {0, 4, {"Hello", "World", "Slice", "Binary"}},
+      {1, 4, {"World", "Slice", "Binary", "Test"}},
+      {0, 5, {"Hello", "World", "Slice", "Binary", "Test"}},
+  };
+
+  // Run tests with binary type
+  GenericTestSlice<uint32_t>(binary(), json_test_strings, testCases);
+
+  // Run tests with large binary type
+  GenericTestSlice<uint64_t>(large_binary(), json_test_strings, testCases);
+}
+
 TEST(ResizableArrayData, Basic) {
   std::unique_ptr<MemoryPool> pool = MemoryPool::CreateDefault();
   for (const auto& type : kSampleFixedDataTypes) {
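For reference, a standalone model of the read-back loop used by `GenericTestSlice` above (the helper and names below are illustrative, not part of the Arrow test suite): given a typed offsets buffer and a character buffer, materializing `length` values starting at `offset` requires reading offsets at the type's true width, `uint32_t` for `binary()` and `uint64_t` for `large_binary()`.

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Read `length` values starting at element `offset` from a var-length layout:
// offsets[i]..offsets[i + 1] delimit value i within the character buffer.
template <typename OffsetType>
std::vector<std::string> ReadValues(const OffsetType* offsets, const char* chars,
                                    int64_t offset, int64_t length) {
  std::vector<std::string> out;
  for (int64_t i = 0; i < length; ++i) {
    OffsetType start = offsets[offset + i];
    OffsetType end = offsets[offset + i + 1];
    out.emplace_back(chars + start, chars + end);  // iterator-range constructor
  }
  return out;
}

int main() {
  const char chars[] = "HelloWorldSlice";
  uint64_t offsets[] = {0, 5, 10, 15};  // 64-bit offsets, as in large_binary()
  auto vals = ReadValues<uint64_t>(offsets, chars, 1, 2);
  assert((vals == std::vector<std::string>{"World", "Slice"}));
  return 0;
}
```

Instantiating this with the wrong `OffsetType` reproduces the symptom the test guards against: the offsets are reinterpreted at the wrong width and the extracted strings no longer match the expected values.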