Skip to content

Commit

Permalink
address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
willsfeng committed Mar 27, 2024
1 parent 77d7cc5 commit 9beb83e
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 32 deletions.
67 changes: 40 additions & 27 deletions velox/functions/prestosql/StringFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -347,48 +347,61 @@ template <typename T>
struct HammingDistanceFunction {
VELOX_DEFINE_FUNCTION_TYPES(T);

template <typename TCodePoint>
void doCall(
void call(
out_type<int64_t>& result,
const TCodePoint* leftCodePoints,
const TCodePoint* rightCodePoints,
size_t leftCodePointsSize,
size_t rightCodePointsSize) {
VELOX_USER_CHECK(
leftCodePointsSize == rightCodePointsSize,
"The input strings to hamming_distance function must have the same length");
const arg_type<Varchar>& left,
const arg_type<Varchar>& right) {
int64_t leftLength = left.size();
int64_t rightLength = right.size();

int64_t distance = 0;
for (int i = 0; i < leftCodePointsSize; i++) {
if (leftCodePoints[i] != rightCodePoints[i]) {
int64_t leftPosition = 0;
int64_t rightPosition = 0;
while (leftPosition < leftLength && rightPosition < rightLength) {
int leftSize = 0;
int rightSize = 0;
auto codePointLeft = utf8proc_codepoint(
left.data() + leftPosition, left.data() + leftLength, leftSize);
auto codePointRight = utf8proc_codepoint(
right.data() + rightPosition, right.data() + rightLength, rightSize);

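// If both code points are invalid, we do not care whether they are equal.
// The following code treats them as equal if they happen to be of the
// same length.
// NOTE(review): a code point of exactly 0 fails the `> 0` test below and
// advances the position by 0 -- confirm this cannot stall the loop.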
leftPosition += codePointLeft > 0 ? leftSize : -codePointLeft;
rightPosition += codePointRight > 0 ? rightSize : -codePointRight;

if (codePointLeft != codePointRight) {
distance++;
}
}
result = distance;
}
VELOX_USER_CHECK(
leftPosition == leftLength && rightPosition == rightLength,
"The input strings to hamming_distance function must have the same length");

void call(
out_type<int64_t>& result,
const arg_type<Varchar>& left,
const arg_type<Varchar>& right) {
auto leftCodePoints = stringImpl::stringToCodePoints(left);
auto rightCodePoints = stringImpl::stringToCodePoints(right);
doCall<int32_t>(
result,
leftCodePoints.data(),
rightCodePoints.data(),
leftCodePoints.size(),
rightCodePoints.size());
result = distance;
}

void callAscii(
out_type<int64_t>& result,
const arg_type<Varchar>& left,
const arg_type<Varchar>& right) {
int64_t leftLength = left.size();
int64_t rightLength = right.size();
VELOX_USER_CHECK_EQ(
leftLength,
rightLength,
"The input strings to hamming_distance function must have the same length");

auto leftCodePoints = reinterpret_cast<const uint8_t*>(left.data());
auto rightCodePoints = reinterpret_cast<const uint8_t*>(right.data());
doCall<uint8_t>(
result, leftCodePoints, rightCodePoints, left.size(), right.size());
int64_t distance = 0;
for (int i = 0; i < leftLength; i++) {
if (leftCodePoints[i] != rightCodePoints[i]) {
distance++;
}
}
result = distance;
}
};

Expand Down
6 changes: 1 addition & 5 deletions velox/functions/prestosql/tests/StringFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1887,10 +1887,6 @@ TEST_F(StringFunctionsTest, hammingDistance) {
"The quick green dog jumps over the grey pot"),
10);

EXPECT_EQ(hammingDistance(std::nullopt, std::nullopt), std::nullopt);
EXPECT_EQ(hammingDistance("hello", std::nullopt), std::nullopt);
EXPECT_EQ(hammingDistance(std::nullopt, "world"), std::nullopt);

EXPECT_EQ(hammingDistance("hello na\u00EFve world", "hello naive world"), 1);
EXPECT_EQ(
hammingDistance(
Expand Down Expand Up @@ -1932,4 +1928,4 @@ TEST_F(StringFunctionsTest, hammingDistance) {
hammingDistance(
"\u4FE1\u5FF5,\u7231,\u5E0C\u671B", "\u4FE1\u5FF5\u5E0C\u671B"),
"The input strings to hamming_distance function must have the same length");
}
}

0 comments on commit 9beb83e

Please sign in to comment.