From 9beb83e51c815f94b2d95e23cbb5389fc04a81c0 Mon Sep 17 00:00:00 2001 From: willsfeng Date: Wed, 27 Mar 2024 09:50:34 -0700 Subject: [PATCH] address comments --- velox/functions/prestosql/StringFunctions.h | 67 +++++++++++-------- .../prestosql/tests/StringFunctionsTest.cpp | 6 +- 2 files changed, 41 insertions(+), 32 deletions(-) diff --git a/velox/functions/prestosql/StringFunctions.h b/velox/functions/prestosql/StringFunctions.h index 9b2092c87910..74b8aafb191a 100644 --- a/velox/functions/prestosql/StringFunctions.h +++ b/velox/functions/prestosql/StringFunctions.h @@ -347,48 +347,61 @@ template struct HammingDistanceFunction { VELOX_DEFINE_FUNCTION_TYPES(T); - template - void doCall( + void call( out_type& result, - const TCodePoint* leftCodePoints, - const TCodePoint* rightCodePoints, - size_t leftCodePointsSize, - size_t rightCodePointsSize) { - VELOX_USER_CHECK( - leftCodePointsSize == rightCodePointsSize, - "The input strings to hamming_distance function must have the same length"); + const arg_type& left, + const arg_type& right) { + int64_t leftLength = left.size(); + int64_t rightLength = right.size(); int64_t distance = 0; - for (int i = 0; i < leftCodePointsSize; i++) { - if (leftCodePoints[i] != rightCodePoints[i]) { + int64_t leftPosition = 0; + int64_t rightPosition = 0; + while (leftPosition < leftLength && rightPosition < rightLength) { + int leftSize = 0; + int rightSize = 0; + auto codePointLeft = utf8proc_codepoint( + left.data() + leftPosition, left.data() + leftLength, leftSize); + auto codePointRight = utf8proc_codepoint( + right.data() + rightPosition, right.data() + rightLength, rightSize); + + // if both code points are invalid, we do not care if they are equal + // the following code treats them as equal if they happen to be of the + // same length + leftPosition += codePointLeft > 0 ? leftSize : -codePointLeft; + rightPosition += codePointRight > 0 ? rightSize : -codePointRight; + + if (codePointLeft != codePointRight) { distance++; } } - result = distance; - } + VELOX_USER_CHECK( + leftPosition == leftLength && rightPosition == rightLength, + "The input strings to hamming_distance function must have the same length"); - void call( - out_type& result, - const arg_type& left, - const arg_type& right) { - auto leftCodePoints = stringImpl::stringToCodePoints(left); - auto rightCodePoints = stringImpl::stringToCodePoints(right); - doCall( - result, - leftCodePoints.data(), - rightCodePoints.data(), - leftCodePoints.size(), - rightCodePoints.size()); + result = distance; } void callAscii( out_type& result, const arg_type& left, const arg_type& right) { + int64_t leftLength = left.size(); + int64_t rightLength = right.size(); + VELOX_USER_CHECK_EQ( + leftLength, + rightLength, + "The input strings to hamming_distance function must have the same length"); + auto leftCodePoints = reinterpret_cast(left.data()); auto rightCodePoints = reinterpret_cast(right.data()); - doCall( - result, leftCodePoints, rightCodePoints, left.size(), right.size()); + int64_t distance = 0; + for (int i = 0; i < leftLength; i++) { + if (leftCodePoints[i] != rightCodePoints[i]) { + distance++; + } + } + result = distance; } }; diff --git a/velox/functions/prestosql/tests/StringFunctionsTest.cpp b/velox/functions/prestosql/tests/StringFunctionsTest.cpp index 167a09d49a5f..3df9a96a00cb 100644 --- a/velox/functions/prestosql/tests/StringFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/StringFunctionsTest.cpp @@ -1887,10 +1887,6 @@ TEST_F(StringFunctionsTest, hammingDistance) { "The quick green dog jumps over the grey pot"), 10); - EXPECT_EQ(hammingDistance(std::nullopt, std::nullopt), std::nullopt); - EXPECT_EQ(hammingDistance("hello", std::nullopt), std::nullopt); - EXPECT_EQ(hammingDistance(std::nullopt, "world"), std::nullopt); - EXPECT_EQ(hammingDistance("hello na\u00EFve world", "hello naive world"), 1); EXPECT_EQ( hammingDistance( @@ -1932,4 +1928,4 @@ TEST_F(StringFunctionsTest, hammingDistance) { hammingDistance( "\u4FE1\u5FF5,\u7231,\u5E0C\u671B", "\u4FE1\u5FF5\u5E0C\u671B"), "The input strings to hamming_distance function must have the same length"); -} \ No newline at end of file +}