Skip to content

Commit

Permalink
add hamming_distance presto function
Browse files Browse the repository at this point in the history
  • Loading branch information
willsfeng committed Mar 27, 2024
1 parent 6edf10a commit d4d03a5
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 0 deletions.
6 changes: 6 additions & 0 deletions velox/docs/functions/presto/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,12 @@ String Functions
empty string. When `replace` is an empty string invalid characters are
removed.

.. function:: hamming_distance(string1, string2) -> bigint

Returns the Hamming distance of ``string1`` and ``string2``,
i.e. the number of positions at which the corresponding characters are different.
The two strings must have the same length in characters; otherwise an error is raised.

.. function:: length(string) -> bigint

Returns the length of ``string`` in characters.
Expand Down
53 changes: 53 additions & 0 deletions velox/functions/prestosql/StringFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,59 @@ struct StrLPosFunction : public StrPosFunctionBase<T, true> {};
template <typename T>
struct StrRPosFunction : public StrPosFunctionBase<T, false> {};

/// hamming_distance(string, string) -> bigint
///
/// Computes the Hamming distance between two strings: the number of positions
/// at which the corresponding characters differ. Both inputs must contain the
/// same number of characters; otherwise a user error is raised.
template <typename T>
struct HammingDistanceFunction {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  /// Counts the positions at which two equally sized sequences differ.
  ///
  /// @tparam TCodePoint Element type of both sequences. The callers in this
  /// struct use int32_t for decoded code points and uint8_t for raw bytes.
  /// @param result Receives the number of differing positions.
  /// @param leftCodePoints First sequence; 'leftCodePointsSize' elements.
  /// @param rightCodePoints Second sequence; 'rightCodePointsSize' elements.
  /// @param leftCodePointsSize Number of elements in 'leftCodePoints'.
  /// @param rightCodePointsSize Number of elements in 'rightCodePoints'.
  /// Fails the VELOX_USER_CHECK below when the two sizes differ.
  template <typename TCodePoint>
  void doCall(
      out_type<int64_t>& result,
      const TCodePoint* leftCodePoints,
      const TCodePoint* rightCodePoints,
      size_t leftCodePointsSize,
      size_t rightCodePointsSize) {
    VELOX_USER_CHECK(
        leftCodePointsSize == rightCodePointsSize,
        "The input strings to hamming_distance function must have the same length");

    int64_t distance = 0;
    // The index is size_t to match the size parameters. A signed 'int' index
    // would be compared against an unsigned bound and would overflow for
    // inputs longer than INT_MAX elements.
    for (size_t i = 0; i < leftCodePointsSize; ++i) {
      if (leftCodePoints[i] != rightCodePoints[i]) {
        ++distance;
      }
    }
    result = distance;
  }

  /// General path: converts both inputs with stringImpl::stringToCodePoints
  /// and compares the resulting sequences element by element, so the length
  /// check and the distance are expressed in code points rather than bytes.
  void call(
      out_type<int64_t>& result,
      const arg_type<Varchar>& left,
      const arg_type<Varchar>& right) {
    auto leftCodePoints = stringImpl::stringToCodePoints(left);
    auto rightCodePoints = stringImpl::stringToCodePoints(right);
    doCall<int32_t>(
        result,
        leftCodePoints.data(),
        rightCodePoints.data(),
        leftCodePoints.size(),
        rightCodePoints.size());
  }

  /// ASCII path: compares the raw bytes of both inputs directly, without
  /// decoding them into code points first.
  /// NOTE(review): presumably the function framework only invokes this when
  /// all inputs are ASCII, where one byte is one character - confirm.
  void callAscii(
      out_type<int64_t>& result,
      const arg_type<Varchar>& left,
      const arg_type<Varchar>& right) {
    auto leftCodePoints = reinterpret_cast<const uint8_t*>(left.data());
    auto rightCodePoints = reinterpret_cast<const uint8_t*>(right.data());
    doCall<uint8_t>(
        result, leftCodePoints, rightCodePoints, left.size(), right.size());
  }
};


template <typename T>
struct LevenshteinDistanceFunction {
VELOX_DEFINE_FUNCTION_TYPES(T);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ void registerSimpleFunctions(const std::string& prefix) {
// Register string functions.
registerFunction<ChrFunction, Varchar, int64_t>({prefix + "chr"});
registerFunction<CodePointFunction, int32_t, Varchar>({prefix + "codepoint"});
registerFunction<HammingDistanceFunction, int64_t, Varchar, Varchar>(
{prefix + "hamming_distance"});
registerFunction<LevenshteinDistanceFunction, int64_t, Varchar, Varchar>(
{prefix + "levenshtein_distance"});
registerFunction<LengthFunction, int64_t, Varchar>({prefix + "length"});
Expand Down
71 changes: 71 additions & 0 deletions velox/functions/prestosql/tests/StringFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1862,3 +1862,74 @@ TEST_F(StringFunctionsTest, varbinaryLength) {
auto result = evaluate("length(c0)", makeRowVector({vector}));
test::assertEqualVectors(expected, result);
}

TEST_F(StringFunctionsTest, hammingDistance) {
  // Evaluates hamming_distance(c0, c1) on a single pair of inputs;
  // std::nullopt stands for a NULL argument.
  const auto distance = [&](std::optional<std::string> lhs,
                            std::optional<std::string> rhs) {
    return evaluateOnce<int64_t>("hamming_distance(c0, c1)", lhs, rhs);
  };

  // Error expected whenever the two inputs differ in length.
  constexpr const char* kLengthMismatch =
      "The input strings to hamming_distance function must have the same length";

  // ASCII inputs of equal length.
  EXPECT_EQ(distance("", ""), 0);
  EXPECT_EQ(distance(" ", " "), 0);
  EXPECT_EQ(distance("6", "6"), 0);
  EXPECT_EQ(distance("z", "z"), 0);
  EXPECT_EQ(distance("a", "b"), 1);
  EXPECT_EQ(distance("b", "B"), 1);
  EXPECT_EQ(distance("hello", "hello"), 0);
  EXPECT_EQ(distance("hello", "jello"), 1);
  EXPECT_EQ(distance("like", "hate"), 3);
  EXPECT_EQ(distance("hello", "world"), 4);
  EXPECT_EQ(distance("Customs", "Luptoki"), 4);
  EXPECT_EQ(distance("This is lame", "Why to slam "), 8);
  EXPECT_EQ(
      distance(
          "The quick brown fox jumps over the lazy dog",
          "The quick green dog jumps over the grey pot"),
      10);

  // A NULL on either side yields NULL.
  EXPECT_EQ(distance(std::nullopt, std::nullopt), std::nullopt);
  EXPECT_EQ(distance("hello", std::nullopt), std::nullopt);
  EXPECT_EQ(distance(std::nullopt, "world"), std::nullopt);

  // Non-ASCII inputs: each multi-byte character counts as a single position.
  EXPECT_EQ(distance("hello na\u00EFve world", "hello naive world"), 1);
  EXPECT_EQ(
      distance(
          "The quick b\u0155\u00F6wn fox jumps over the laz\uFF59 dog",
          "The quick br\u006Fwn fox jumps over the la\u1E91y dog"),
      4);
  EXPECT_EQ(
      distance(
          "\u4FE1\u5FF5,\u7231,\u5E0C\u671B",
          "\u4FE1\u4EF0,\u7231,\u5E0C\u671B"),
      1);
  EXPECT_EQ(
      distance(
          "\u4F11\u5FF5,\u7231,\u5E0C\u671B",
          "\u4FE1\u5FF5,\u7231,\u5E0C\u671B"),
      1);
  EXPECT_EQ(distance("\u0001", "\u0001"), 0);
  EXPECT_EQ(distance("\u0001", "\u0002"), 1);

  // Inputs of different lengths are rejected.
  // NOTE(review): "\u0000" is a NUL character, so the std::string built from
  // that literal is presumably empty and this case is a 0-vs-1 length
  // mismatch rather than a one-character difference - confirm that is the
  // intent.
  VELOX_ASSERT_THROW(distance("\u0000", "\u0001"), kLengthMismatch);
  VELOX_ASSERT_THROW(distance("hello", ""), kLengthMismatch);
  VELOX_ASSERT_THROW(distance("", "hello"), kLengthMismatch);
  VELOX_ASSERT_THROW(distance("hello", "o"), kLengthMismatch);
  VELOX_ASSERT_THROW(distance("h", "hello"), kLengthMismatch);
  VELOX_ASSERT_THROW(
      distance("hello na\u00EFve world", "hello na:ive world"),
      kLengthMismatch);
  VELOX_ASSERT_THROW(
      distance("\u4FE1\u5FF5,\u7231,\u5E0C\u671B", "\u4FE1\u5FF5\u5E0C\u671B"),
      kLengthMismatch);
}

0 comments on commit d4d03a5

Please sign in to comment.