Skip to content

Commit

Permalink
apacheGH-38074: [C++] Fix Offset Size Calculation for Slicing Large S…
Browse files Browse the repository at this point in the history
…tring and Binary Types in Hash Join (apache#38147)

### Rationale for this change

We found that the wrong results in inner joins during hash join operations were caused by a problem with how large strings and binary types were handled. The `Slice` function was not calculating their sizes correctly.

To fix this, I changed the `Slice` function to calculate the sizes correctly, based on the type of data for large string and binary. 

* Issue raised: apache#37729 

### What changes are included in this PR?

* The `Slice` function has been updated to correctly calculate the offset for Large String and Large Binary types, and assertion statements have been added to improve maintainability.
* Unit tests (`TEST(KeyColumnArray, SliceBinaryTest)`)for the Slice function have been added. 
* During random tests for Hash Join (`TEST(HashJoin, Random)`), modifications were made to allow the creation of Large String as key column values.

### Are these changes tested?

Yes

### Are there any user-facing changes?

Acero might not have a large user base as it is an experimental feature, but I deemed the issue of incorrect join results as critical and have addressed the bug.

* Closes: apache#38074

Authored-by: Hyunseok Seo <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
llama90 authored and dgreiss committed Feb 17, 2024
1 parent 3a0f6f3 commit 649b4ca
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 8 deletions.
27 changes: 23 additions & 4 deletions cpp/src/arrow/acero/hash_join_node_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,8 @@ struct RandomDataTypeConstraints {
int max_string_length;

void Default() {
data_type_enabled_mask = kInt1 | kInt2 | kInt4 | kInt8 | kBool | kBinary | kString;
data_type_enabled_mask =
kInt1 | kInt2 | kInt4 | kInt8 | kBool | kBinary | kString | kLargeString;
min_null_probability = 0.0;
max_null_probability = 0.2;
min_binary_length = 1;
Expand Down Expand Up @@ -289,6 +290,7 @@ struct RandomDataTypeConstraints {
static constexpr int64_t kBool = 16;
static constexpr int64_t kBinary = 32;
static constexpr int64_t kString = 64;
static constexpr int64_t kLargeString = 128;
};

struct RandomDataType {
Expand All @@ -297,6 +299,7 @@ struct RandomDataType {
int fixed_length;
int min_string_length;
int max_string_length;
bool is_large_string;

static RandomDataType Random(Random64Bit& rng,
const RandomDataTypeConstraints& constraints) {
Expand All @@ -312,6 +315,15 @@ struct RandomDataType {
} else {
result.is_fixed_length = true;
}

if (!result.is_fixed_length &&
(constraints.data_type_enabled_mask & constraints.kLargeString) != 0) {
// When selecting the string type, there's a 50% chance of choosing a large string.
result.is_large_string = ((rng.next() % 2) == 0);
} else {
result.is_large_string = false;
}

if (constraints.max_null_probability > 0.0) {
// 25% chance of no nulls
// Uniform distribution of null probability from min to max
Expand Down Expand Up @@ -405,9 +417,16 @@ std::vector<std::shared_ptr<Array>> GenRandomRecords(
break;
}
} else {
result.push_back(rag.String(num_rows, data_types[i].min_string_length,
data_types[i].max_string_length,
data_types[i].null_probability));
if (data_types[i].is_large_string) {
// Generate LargeString if is_large_string flag is true
result.push_back(rag.LargeString(num_rows, data_types[i].min_string_length,
data_types[i].max_string_length,
data_types[i].null_probability));
} else {
result.push_back(rag.String(num_rows, data_types[i].min_string_length,
data_types[i].max_string_length,
data_types[i].null_probability));
}
}
}
return result;
Expand Down
12 changes: 8 additions & 4 deletions cpp/src/arrow/compute/light_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,27 +80,31 @@ KeyColumnArray KeyColumnArray::Slice(int64_t offset, int64_t length) const {
KeyColumnArray sliced;
sliced.metadata_ = metadata_;
sliced.length_ = length;
uint32_t fixed_size =
!metadata_.is_fixed_length ? sizeof(uint32_t) : metadata_.fixed_length;
uint32_t fixed_size = metadata_.fixed_length;

sliced.buffers_[0] =
buffers_[0] ? buffers_[0] + (bit_offset_[0] + offset) / 8 : nullptr;
sliced.mutable_buffers_[0] =
mutable_buffers_[0] ? mutable_buffers_[0] + (bit_offset_[0] + offset) / 8 : nullptr;
sliced.bit_offset_[0] = (bit_offset_[0] + offset) % 8;

if (fixed_size == 0 && !metadata_.is_null_type) {
if (metadata_.fixed_length == 0 && !metadata_.is_null_type) {
ARROW_DCHECK(is_bool_type()) << "Expected BOOL type type but got a different type.";
sliced.buffers_[1] =
buffers_[1] ? buffers_[1] + (bit_offset_[1] + offset) / 8 : nullptr;
sliced.mutable_buffers_[1] = mutable_buffers_[1]
? mutable_buffers_[1] + (bit_offset_[1] + offset) / 8
: nullptr;
sliced.bit_offset_[1] = (bit_offset_[1] + offset) % 8;
} else {
} else if (metadata_.fixed_length > 0) {
ARROW_DCHECK(is_binary_type() || is_large_binary_type() || is_fixed_width_types())
<< "Expected (LARGE) BINARY or FIXED WIDTH type but got a different type.";
sliced.buffers_[1] = buffers_[1] ? buffers_[1] + offset * fixed_size : nullptr;
sliced.mutable_buffers_[1] =
mutable_buffers_[1] ? mutable_buffers_[1] + offset * fixed_size : nullptr;
sliced.bit_offset_[1] = 0;
} else {
ARROW_DCHECK(is_null_type()) << "Expected Null type but got a different type.";
}

sliced.buffers_[2] = buffers_[2];
Expand Down
25 changes: 25 additions & 0 deletions cpp/src/arrow/compute/light_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,31 @@ class ARROW_EXPORT KeyColumnArray {
// Starting bit offset within the first byte (between 0 and 7)
// to be used when accessing buffers that store bit vectors.
int bit_offset_[kMaxBuffers - 1];

bool is_bool_type() const {
return metadata_.is_fixed_length && metadata_.fixed_length == 0 &&
!metadata_.is_null_type;
}

bool is_fixed_width_types() const {
return metadata_.is_fixed_length && metadata_.fixed_length != 0 &&
!metadata_.is_null_type;
}

bool is_binary_type() const {
return !metadata_.is_fixed_length && metadata_.fixed_length == sizeof(uint32_t) &&
!metadata_.is_null_type;
}

bool is_large_binary_type() const {
return !metadata_.is_fixed_length && metadata_.fixed_length == sizeof(uint64_t) &&
!metadata_.is_null_type;
}

bool is_null_type() const {
return metadata_.is_fixed_length && metadata_.fixed_length == 0 &&
metadata_.is_null_type;
}
};

/// \brief Create KeyColumnMetadata from a DataType
Expand Down
60 changes: 60 additions & 0 deletions cpp/src/arrow/compute/light_array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,66 @@ TEST(KeyColumnArray, SliceBool) {
}
}

struct SliceTestCase {
int offset;
int length;
std::vector<std::string> expected;
};

template <typename OffsetType>
void GenericTestSlice(const std::shared_ptr<DataType>& type, const char* json_data,
const std::vector<SliceTestCase>& testCases) {
auto array = ArrayFromJSON(type, json_data);
KeyColumnArray kc_array =
ColumnArrayFromArrayData(array->data(), 0, array->length()).ValueOrDie();

for (const auto& testCase : testCases) {
ARROW_SCOPED_TRACE("Offset: ", testCase.offset, " Length: ", testCase.length);
KeyColumnArray sliced = kc_array.Slice(testCase.offset, testCase.length);

// Extract binary data from the sliced KeyColumnArray
std::vector<std::string> sliced_data;
const auto* offset_data = reinterpret_cast<const OffsetType*>(sliced.data(1));
const auto* string_data = reinterpret_cast<const char*>(sliced.data(2));

for (auto i = 0; i < testCase.length; ++i) {
auto start = offset_data[i];
auto end = offset_data[i + 1];
sliced_data.push_back(std::string(string_data + start, string_data + end));
}

// Compare the sliced values to the expected string
ASSERT_EQ(testCase.expected, sliced_data);
}
}

TEST(KeyColumnArray, SliceBinaryTest) {
const char* json_test_strings = R"(["Hello", "World", "Slice", "Binary", "Test"])";
std::vector<SliceTestCase> testCases = {
{0, 1, {"Hello"}},
{1, 1, {"World"}},
{2, 1, {"Slice"}},
{3, 1, {"Binary"}},
{4, 1, {"Test"}},
{0, 2, {"Hello", "World"}},
{1, 2, {"World", "Slice"}},
{2, 2, {"Slice", "Binary"}},
{3, 2, {"Binary", "Test"}},
{0, 3, {"Hello", "World", "Slice"}},
{1, 3, {"World", "Slice", "Binary"}},
{2, 3, {"Slice", "Binary", "Test"}},
{0, 4, {"Hello", "World", "Slice", "Binary"}},
{1, 4, {"World", "Slice", "Binary", "Test"}},
{0, 5, {"Hello", "World", "Slice", "Binary", "Test"}},
};

// Run tests with binary type
GenericTestSlice<int32_t>(binary(), json_test_strings, testCases);

// Run tests with large binary type
GenericTestSlice<int64_t>(large_binary(), json_test_strings, testCases);
}

TEST(ResizableArrayData, Basic) {
std::unique_ptr<MemoryPool> pool = MemoryPool::CreateDefault();
for (const auto& type : kSampleFixedDataTypes) {
Expand Down

0 comments on commit 649b4ca

Please sign in to comment.