From d4d03a51a592e2129be7cb57ecc31ff7ee80c7e3 Mon Sep 17 00:00:00 2001 From: willsfeng Date: Tue, 26 Mar 2024 18:06:27 -0700 Subject: [PATCH] add hamming_distance presto function --- velox/docs/functions/presto/string.rst | 6 ++ velox/functions/prestosql/StringFunctions.h | 53 ++++++++++++++ .../StringFunctionsRegistration.cpp | 2 + .../prestosql/tests/StringFunctionsTest.cpp | 71 +++++++++++++++++++ 4 files changed, 132 insertions(+) diff --git a/velox/docs/functions/presto/string.rst b/velox/docs/functions/presto/string.rst index 9efc237ccf49..2db49c6f6c81 100644 --- a/velox/docs/functions/presto/string.rst +++ b/velox/docs/functions/presto/string.rst @@ -54,6 +54,12 @@ String Functions empty string. When `replace` is an empty string invalid characters are removed. +.. function:: hamming_distance(string1, string2) -> bigint + + Returns the Hamming distance of ``string1`` and ``string2``, + i.e. the number of positions at which the corresponding characters are different. + Note that the two strings must have the same length. + .. function:: length(string) -> bigint Returns the length of ``string`` in characters. diff --git a/velox/functions/prestosql/StringFunctions.h b/velox/functions/prestosql/StringFunctions.h index 391b8996a169..f88643128bc6 100644 --- a/velox/functions/prestosql/StringFunctions.h +++ b/velox/functions/prestosql/StringFunctions.h @@ -341,6 +341,59 @@ struct StrLPosFunction : public StrPosFunctionBase {}; template struct StrRPosFunction : public StrPosFunctionBase {}; +/// hamming_distance(string, string) -> bigint +/// Computes the hamming distance between two strings. 
+template <typename T>
+struct HammingDistanceFunction {
+  VELOX_DEFINE_FUNCTION_TYPES(T);
+
+  /// Sets 'result' to the number of positions at which the two code point
+  /// arrays differ. Fails the user check if their sizes are not equal.
+  template <typename TCodePoint>
+  void doCall(
+      out_type<int64_t>& result,
+      const TCodePoint* leftCodePoints,
+      const TCodePoint* rightCodePoints,
+      size_t leftCodePointsSize,
+      size_t rightCodePointsSize) {
+    VELOX_USER_CHECK(
+        leftCodePointsSize == rightCodePointsSize,
+        "The input strings to hamming_distance function must have the same length");
+    int64_t distance = 0;
+    for (size_t i = 0; i < leftCodePointsSize; ++i) {
+      if (leftCodePoints[i] != rightCodePoints[i]) {
+        ++distance;
+      }
+    }
+    result = distance;
+  }
+
+  /// Decodes both inputs into code points and compares those.
+  void call(
+      out_type<int64_t>& result,
+      const arg_type<Varchar>& left,
+      const arg_type<Varchar>& right) {
+    auto leftCodePoints = stringImpl::stringToCodePoints(left);
+    auto rightCodePoints = stringImpl::stringToCodePoints(right);
+    doCall(
+        result,
+        leftCodePoints.data(),
+        rightCodePoints.data(),
+        leftCodePoints.size(),
+        rightCodePoints.size());
+  }
+
+  /// ASCII variant: compares the raw bytes of both inputs without decoding.
+  void callAscii(
+      out_type<int64_t>& result,
+      const arg_type<Varchar>& left,
+      const arg_type<Varchar>& right) {
+    auto leftCodePoints = reinterpret_cast<const uint8_t*>(left.data());
+    auto rightCodePoints = reinterpret_cast<const uint8_t*>(right.data());
+    doCall(result, leftCodePoints, rightCodePoints, left.size(), right.size());
+  }
+};
+
 template struct LevenshteinDistanceFunction { VELOX_DEFINE_FUNCTION_TYPES(T);
diff --git a/velox/functions/prestosql/registration/StringFunctionsRegistration.cpp b/velox/functions/prestosql/registration/StringFunctionsRegistration.cpp
index 4b83803d25e2..f1e36049e92d 100644
--- a/velox/functions/prestosql/registration/StringFunctionsRegistration.cpp
+++ b/velox/functions/prestosql/registration/StringFunctionsRegistration.cpp
@@ -36,6 +36,8 @@ void registerSimpleFunctions(const std::string& prefix) {
 // Register string functions.
registerFunction({prefix + "chr"}); registerFunction({prefix + "codepoint"});
+  registerFunction<HammingDistanceFunction, int64_t, Varchar, Varchar>(
+      {prefix + "hamming_distance"});
 registerFunction( {prefix + "levenshtein_distance"}); registerFunction({prefix + "length"});
diff --git a/velox/functions/prestosql/tests/StringFunctionsTest.cpp b/velox/functions/prestosql/tests/StringFunctionsTest.cpp
index 27e38aaf9919..167a09d49a5f 100644
--- a/velox/functions/prestosql/tests/StringFunctionsTest.cpp
+++ b/velox/functions/prestosql/tests/StringFunctionsTest.cpp
@@ -1862,3 +1862,82 @@ TEST_F(StringFunctionsTest, varbinaryLength) {
 auto result = evaluate("length(c0)", makeRowVector({vector})); test::assertEqualVectors(expected, result); }
+
+// Checks hamming_distance on ASCII and non-ASCII inputs, null arguments, and
+// inputs of unequal length.
+TEST_F(StringFunctionsTest, hammingDistance) {
+  const auto hammingDistance = [&](std::optional<std::string> left,
+                                   std::optional<std::string> right) {
+    return evaluateOnce<int64_t>("hamming_distance(c0, c1)", left, right);
+  };
+
+  // ASCII inputs of equal length.
+  EXPECT_EQ(hammingDistance("", ""), 0);
+  EXPECT_EQ(hammingDistance(" ", " "), 0);
+  EXPECT_EQ(hammingDistance("6", "6"), 0);
+  EXPECT_EQ(hammingDistance("z", "z"), 0);
+  EXPECT_EQ(hammingDistance("a", "b"), 1);
+  EXPECT_EQ(hammingDistance("b", "B"), 1);
+  EXPECT_EQ(hammingDistance("hello", "hello"), 0);
+  EXPECT_EQ(hammingDistance("hello", "jello"), 1);
+  EXPECT_EQ(hammingDistance("like", "hate"), 3);
+  EXPECT_EQ(hammingDistance("hello", "world"), 4);
+  EXPECT_EQ(hammingDistance("Customs", "Luptoki"), 4);
+  EXPECT_EQ(hammingDistance("This is lame", "Why to slam "), 8);
+  EXPECT_EQ(
+      hammingDistance(
+          "The quick brown fox jumps over the lazy dog",
+          "The quick green dog jumps over the grey pot"),
+      10);
+
+  // A null argument is expected to produce a null result.
+  EXPECT_EQ(hammingDistance(std::nullopt, std::nullopt), std::nullopt);
+  EXPECT_EQ(hammingDistance("hello", std::nullopt), std::nullopt);
+  EXPECT_EQ(hammingDistance(std::nullopt, "world"), std::nullopt);
+
+  // Non-ASCII inputs. Assuming UTF-8 literals, the lengths in the first case
+  // are equal in characters but not in bytes.
+  EXPECT_EQ(hammingDistance("hello na\u00EFve world", "hello naive world"), 1);
+  EXPECT_EQ(
+      hammingDistance(
+          "The quick b\u0155\u00F6wn fox jumps over the laz\uFF59 dog",
+          "The quick br\u006Fwn fox jumps over the la\u1E91y dog"),
+      4);
+  EXPECT_EQ(
+      hammingDistance(
+          "\u4FE1\u5FF5,\u7231,\u5E0C\u671B",
+          "\u4FE1\u4EF0,\u7231,\u5E0C\u671B"),
+      1);
+  EXPECT_EQ(
+      hammingDistance(
+          "\u4F11\u5FF5,\u7231,\u5E0C\u671B",
+          "\u4FE1\u5FF5,\u7231,\u5E0C\u671B"),
+      1);
+  EXPECT_EQ(hammingDistance("\u0001", "\u0001"), 0);
+  EXPECT_EQ(hammingDistance("\u0001", "\u0002"), 1);
+
+  // Inputs with different numbers of characters must be rejected. The first
+  // case passes only a NUL escape, presumably converted to an empty string.
+  VELOX_ASSERT_THROW(
+      hammingDistance("\u0000", "\u0001"),
+      "The input strings to hamming_distance function must have the same length");
+  VELOX_ASSERT_THROW(
+      hammingDistance("hello", ""),
+      "The input strings to hamming_distance function must have the same length");
+  VELOX_ASSERT_THROW(
+      hammingDistance("", "hello"),
+      "The input strings to hamming_distance function must have the same length");
+  VELOX_ASSERT_THROW(
+      hammingDistance("hello", "o"),
+      "The input strings to hamming_distance function must have the same length");
+  VELOX_ASSERT_THROW(
+      hammingDistance("h", "hello"),
+      "The input strings to hamming_distance function must have the same length");
+  VELOX_ASSERT_THROW(
+      hammingDistance("hello na\u00EFve world", "hello na:ive world"),
+      "The input strings to hamming_distance function must have the same length");
+  VELOX_ASSERT_THROW(
+      hammingDistance(
+          "\u4FE1\u5FF5,\u7231,\u5E0C\u671B", "\u4FE1\u5FF5\u5E0C\u671B"),
+      "The input strings to hamming_distance function must have the same length");
+}