Skip to content

Commit

Permalink
Add normalize Presto scalar function (facebookincubator#8590)
Browse files Browse the repository at this point in the history
Summary:
Add normalize() Presto scalar function

Resolves : prestodb/presto#20224

Reference : https://github.com/prestodb/presto/blob/master/presto-main/src/main/java/com/facebook/presto/operator/scalar/StringFunctions.java#L833

Pull Request resolved: facebookincubator#8590

Reviewed By: bikramSingh91

Differential Revision: D58384754

Pulled By: kevinwilfong

fbshipit-source-id: 2ed7a0e7311c3f14bfdba2d3784aee50805fc6ce
  • Loading branch information
pdabre12 authored and facebook-github-bot committed Jun 14, 2024
1 parent 973c334 commit 4dbe29c
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 2 deletions.
26 changes: 25 additions & 1 deletion velox/docs/functions/presto/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,30 @@ String Functions
Unicode Functions
-----------------

.. function:: normalize(string) -> varchar

Transforms ``string`` with NFC normalization form.

.. function:: normalize(string, form) -> varchar

Reference: https://unicode.org/reports/tr15/#Norm_Forms
Transforms ``string`` with the specified normalization form.
``form`` must be one of the following keywords:

======== ===========
Form Description
======== ===========
``NFD`` Canonical Decomposition
``NFC`` Canonical Decomposition, followed by Canonical Composition
``NFKD`` Compatibility Decomposition
``NFKC`` Compatibility Decomposition, followed by Canonical Composition
======== ===========

.. note::

This SQL-standard function has special syntax and requires
specifying ``form`` as a keyword, not as a string.

.. function:: to_utf8(string) -> varbinary

Encodes ``string`` into a UTF-8 varbinary representation.
Encodes ``string`` into a UTF-8 varbinary representation.
2 changes: 1 addition & 1 deletion velox/external/utf8proc/utf8procImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -635,7 +635,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
}
decomp_result = utf8proc_decompose_char(
uc,
buffer + wpos,
buffer ? buffer + wpos : buffer,
(bufsize > wpos) ? (bufsize - wpos) : 0,
options,
&boundclass);
Expand Down
62 changes: 62 additions & 0 deletions velox/functions/prestosql/StringFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -494,4 +494,66 @@ struct LevenshteinDistanceFunction {
}
};

/// Presto normalize(string[, form]) scalar function.
///
/// Transforms 'string' into a Unicode normalization form by delegating to
/// utf8proc_map. The one-argument overload uses NFC; the two-argument overload
/// uses the form named by 'form', which must be one of NFD, NFC, NFKD, NFKC.
template <typename T>
struct NormalizeFunction {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  // Maps each supported normalization form keyword to the utf8proc option
  // flags handed to utf8proc_map.
  const static inline std::unordered_map<std::string, utf8proc_int16_t>
      normalizationOptions{
          {"NFC", (UTF8PROC_STABLE | UTF8PROC_COMPOSE)},
          {"NFD", (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE)},
          {"NFKC", (UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT)},
          {"NFKD", (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT)}};

  // Validates 'form' up front: raises a user error if the pointer is null or
  // if the keyword is not a key of 'normalizationOptions'.
  // NOTE(review): presumably the framework passes a null 'form' when the
  // argument is not a constant expression - confirm.
  FOLLY_ALWAYS_INLINE void initialize(
      const std::vector<TypePtr>& /*inputTypes*/,
      const core::QueryConfig& /*config*/,
      const arg_type<Varchar>* /*string*/,
      const arg_type<Varchar>* form) {
    VELOX_USER_CHECK_NOT_NULL(form);
    VELOX_USER_CHECK_NE(
        normalizationOptions.count(*form),
        0,
        "Normalization form must be one of [NFD, NFC, NFKD, NFKC]");
  }

  // normalize(string): normalizes with the default NFC form.
  FOLLY_ALWAYS_INLINE void call(
      out_type<Varchar>& result,
      const arg_type<Varchar>& string) {
    doCall(result, string, "NFC");
  }

  // normalize(string, form): normalizes with the requested form.
  FOLLY_ALWAYS_INLINE void call(
      out_type<Varchar>& result,
      const arg_type<Varchar>& string,
      const arg_type<Varchar>& form) {
    doCall(result, string, form);
  }

  // Runs utf8proc_map over 'string' with the option flags registered for
  // 'form' and writes the normalized bytes to 'result'. If utf8proc_map
  // reports a failure (negative length), 'string' is copied to 'result'
  // unchanged.
  //
  // Note: utf8proc_map newly allocates its output using malloc, so the buffer
  // must be free'd before returning.
  FOLLY_ALWAYS_INLINE void doCall(
      out_type<Varchar>& result,
      const arg_type<Varchar>& string,
      const arg_type<Varchar>& form) {
    // Owns the malloc'd buffer and frees it on every exit path, including an
    // exception thrown while writing to 'result'. free(nullptr) is a no-op,
    // which covers the case where utf8proc_map never set the pointer.
    struct OutputGuard {
      utf8proc_uint8_t* buffer{nullptr};
      ~OutputGuard() {
        free(buffer);
      }
    } output;

    const auto outputLength = utf8proc_map(
        reinterpret_cast<const utf8proc_uint8_t*>(string.data()),
        string.size(),
        &output.buffer,
        normalizationOptions.at(form));
    if (outputLength < 0) {
      result = string;
      return;
    }

    result.resize(outputLength);
    if (result.data()) {
      std::memcpy(
          result.data(),
          reinterpret_cast<const char*>(output.buffer),
          outputLength);
    }
  }
};

} // namespace facebook::velox::functions
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,10 @@ void registerStringFunctions(const std::string& prefix) {
registerFunction<StrRPosFunction, int64_t, Varchar, Varchar, int64_t>(
{prefix + "strrpos"});

registerFunction<NormalizeFunction, Varchar, Varchar>({prefix + "normalize"});
registerFunction<NormalizeFunction, Varchar, Varchar, Varchar>(
{prefix + "normalize"});

// word_stem function
registerFunction<WordStemFunction, Varchar, Varchar>({prefix + "word_stem"});
registerFunction<WordStemFunction, Varchar, Varchar, Varchar>(
Expand Down
54 changes: 54 additions & 0 deletions velox/functions/prestosql/tests/StringFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1947,3 +1947,57 @@ TEST_F(StringFunctionsTest, hammingDistance) {
hammingDistance("\xFF\x82\xFF", "\xF0\x82"),
"The input strings to hamming_distance function must have the same length");
}

// Exercises normalize(string) and normalize(string, form): null and empty
// inputs, decomposition/composition round trips, compatibility mappings,
// invalid UTF-8 input and an unsupported form keyword.
TEST_F(StringFunctionsTest, normalize) {
  const auto normalizeWithoutForm = [&](std::optional<std::string> string) {
    return evaluateOnce<std::string>("normalize(c0)", string);
  };

  const auto normalizeWithForm = [&](std::optional<std::string> string,
                                     const std::string& form) {
    return evaluateOnce<std::string>(
        fmt::format("normalize(c0, '{}')", form), string);
  };

  EXPECT_EQ(normalizeWithoutForm(std::nullopt), std::nullopt);
  EXPECT_EQ(normalizeWithoutForm(""), "");
  EXPECT_EQ(normalizeWithoutForm("sch\u00f6n"), "sch\u00f6n");
  EXPECT_EQ(normalizeWithForm(std::nullopt, "NFD"), std::nullopt);
  EXPECT_EQ(normalizeWithForm("", "NFKC"), "");

  // Decomposition and recomposition are asserted separately. Nesting them as
  // `(normalizeWithForm(...), "literal")` would invoke the comma operator and
  // silently discard the decomposed result.
  EXPECT_EQ(normalizeWithForm("sch\u00f6n", "NFD"), "scho\u0308n");
  EXPECT_EQ(normalizeWithForm("scho\u0308n", "NFC"), "sch\u00f6n");
  EXPECT_EQ(normalizeWithForm("sch\u00f6n", "NFKD"), "scho\u0308n");
  EXPECT_EQ(normalizeWithForm("scho\u0308n", "NFKC"), "sch\u00f6n");
  EXPECT_EQ(
      normalizeWithForm("Hello world from Velox!!", "NFKC"),
      "Hello world from Velox!!");

  std::string testStringOne =
      "\u3231\u3327\u3326\u2162\u3231\u3327\u3326\u2162\u3231\u3327\u3326\u2162";
  std::string testStringTwo =
      "(\u682a)\u30c8\u30f3\u30c9\u30ebIII(\u682a)\u30c8\u30f3\u30c9\u30ebIII(\u682a)\u30c8\u30f3\u30c9\u30ebIII";
  EXPECT_EQ(normalizeWithForm(testStringOne, "NFKC"), testStringTwo);
  EXPECT_EQ(
      normalizeWithForm((normalizeWithForm(testStringTwo, "NFC")), "NFKC"),
      testStringTwo);

  std::string testStringThree =
      "\uff8a\uff9d\uff76\uff78\uff76\uff85\uff8a\uff9d\uff76\uff78\uff76\uff85\uff8a\uff9d\uff76\uff78\uff76\uff85\uff8a\uff9d\uff76\uff78\uff76\uff85";
  std::string testStringFour =
      "\u30cf\u30f3\u30ab\u30af\u30ab\u30ca\u30cf\u30f3\u30ab\u30af\u30ab\u30ca\u30cf\u30f3\u30ab\u30af\u30ab\u30ca\u30cf\u30f3\u30ab\u30af\u30ab\u30ca";
  EXPECT_EQ(normalizeWithForm(testStringThree, "NFKC"), testStringFour);
  EXPECT_EQ(
      normalizeWithForm((normalizeWithForm(testStringFour, "NFD")), "NFKC"),
      testStringFour);

  // Invalid UTF-8 string: "\x8" is the single byte 0x08, so the bytes are
  // EF BE 08, a three-byte lead followed by a non-continuation byte. The
  // function is expected to return such input unchanged.
  std::string inValidTestString = "\xEF\xBE\x8";
  EXPECT_EQ(normalizeWithForm(inValidTestString, "NFKC"), inValidTestString);

  // Unsupported form keyword.
  VELOX_ASSERT_THROW(
      normalizeWithForm("sch\u00f6n", "NFKE"),
      "Normalization form must be one of [NFD, NFC, NFKD, NFKC]");
}

0 comments on commit 4dbe29c

Please sign in to comment.