Skip to content

Commit

Permalink
Add word_stem Presto function (#9363)
Browse files Browse the repository at this point in the history
Summary:
Add snowball libstemmer v2.2.0 as one of the dependencies.
And use it to implement the word_stem() as a scalar UDF.

When using the libstemmer API, each language creates an `sb_stemmer`
instance which consumes 114 bytes, including the default 10 bytes for the output stem.
It uses the `realloc` to increase the memory block for the output stem if needed.

Fixes #8487

Pull Request resolved: #9363

Reviewed By: amitkdutta

Differential Revision: D56059511

Pulled By: pedroerp

fbshipit-source-id: b3a66956c3809e3f3dadfc8cc7b397b7116996d5
  • Loading branch information
yhwang authored and facebook-github-bot committed Apr 27, 2024
1 parent 6bcf11e commit 4797041
Show file tree
Hide file tree
Showing 10 changed files with 343 additions and 1 deletion.
1 change: 1 addition & 0 deletions CMake/resolve_dependency_modules/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ by Velox. See details on bundling below.
| wangle | v2024.04.01.00 | No |
| mvfst | v2024.04.01.00 | No |
| fbthrift | v2024.04.01.00 | No |
| libstemmer | 2.2.0 | Yes |
| DuckDB (testing) | 0.8.1 | Yes |
| cpr (testing) | 1.10.15 | Yes |

Expand Down
24 changes: 24 additions & 0 deletions CMake/resolve_dependency_modules/libstemmer/Makefile.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@
EXEEXT=.exe
endif
CFLAGS=-O2
-CPPFLAGS=-Iinclude
+CPPFLAGS=-Iinclude -fPIC
all: libstemmer.a stemwords$(EXEEXT)
libstemmer.a: $(snowball_sources:.c=.o)
$(AR) -cru $@ $^
57 changes: 57 additions & 0 deletions CMake/resolve_dependency_modules/stemmer.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Builds the bundled Snowball libstemmer as a static library and exposes it to
# the rest of the build as the imported target `stemmer::stemmer`.
include_guard(GLOBAL)

# Pinned release of libstemmer_c together with the SHA256 of its tarball.
set(VELOX_STEMMER_VERSION 2.2.0)
set(VELOX_STEMMER_BUILD_SHA256_CHECKSUM
b941d9fe9cf36b4e2f8d3873cd4d8b8775bd94867a1df8d8c001bb8b688377c3)
set(VELOX_STEMMER_SOURCE_URL
"https://snowballstem.org/dist/libstemmer_c-${VELOX_STEMMER_VERSION}.tar.gz"
)

# NOTE(review): resolve_dependency_url() is defined elsewhere; presumably it
# finalizes VELOX_STEMMER_SOURCE_URL / VELOX_STEMMER_BUILD_SHA256_CHECKSUM
# (e.g. allows overrides and puts the checksum into the form URL_HASH expects)
# -- confirm against its definition.
resolve_dependency_url(STEMMER)

message(STATUS "Building stemmer from source")
# libstemmer ships a plain Makefile, so `make` must be available.
find_program(MAKE_PROGRAM make REQUIRED)

# Download/build location inside the build tree, and the headers within it.
set(STEMMER_PREFIX "${CMAKE_BINARY_DIR}/_deps/libstemmer")
set(STEMMER_INCLUDE_PATH ${STEMMER_PREFIX}/src/libstemmer/include)

# FetchContent cannot be used because libstemmer does not provide a CMake
# build. Instead the tarball is downloaded, patched with
# libstemmer/Makefile.patch (which adds -fPIC to CPPFLAGS) and built in-source
# with make. There are no configure or install steps; the static library is
# consumed directly from the source directory.
ExternalProject_Add(
libstemmer
PREFIX ${STEMMER_PREFIX}
SOURCE_DIR ${STEMMER_PREFIX}/src/libstemmer
URL ${VELOX_STEMMER_SOURCE_URL}
URL_HASH ${VELOX_STEMMER_BUILD_SHA256_CHECKSUM}
BUILD_IN_SOURCE TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ${MAKE_PROGRAM}
INSTALL_COMMAND ""
PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/libstemmer/Makefile.patch
BUILD_BYPRODUCTS
${STEMMER_PREFIX}/src/libstemmer/${CMAKE_STATIC_LIBRARY_PREFIX}stemmer${CMAKE_STATIC_LIBRARY_SUFFIX}
)

# Imported target wrapping the static library produced by the external
# project above.
add_library(stemmer STATIC IMPORTED)
add_library(stemmer::stemmer ALIAS stemmer)
# The include directory only appears once the tarball has been extracted at
# build time; create it up front (presumably so that referencing it in
# INTERFACE_INCLUDE_DIRECTORIES does not fail at configure time).
file(MAKE_DIRECTORY ${STEMMER_INCLUDE_PATH})
set_target_properties(
stemmer
PROPERTIES
IMPORTED_LOCATION
${STEMMER_PREFIX}/src/libstemmer/${CMAKE_STATIC_LIBRARY_PREFIX}stemmer${CMAKE_STATIC_LIBRARY_SUFFIX}
INTERFACE_INCLUDE_DIRECTORIES ${STEMMER_INCLUDE_PATH})

# Make sure the external project is built before anything that links against
# the imported target.
add_dependencies(stemmer libstemmer)
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,9 @@ endif()
set_source(xsimd)
resolve_dependency(xsimd 10.0.0)

set(stemmer_SOURCE BUNDLED)
resolve_dependency(stemmer)

if(VELOX_BUILD_TESTING)
set(BUILD_TESTING ON)
include(CTest) # include after project() but before add_subdirectory()
Expand Down
37 changes: 37 additions & 0 deletions velox/docs/functions/presto/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,43 @@ String Functions

Converts ``string`` to uppercase.

.. function:: word_stem(word) -> varchar

Returns the stem of ``word`` in the English language. If the ``word`` is not an English word,
the ``word`` in lowercase is returned.

.. function:: word_stem(word, lang) -> varchar

Returns the stem of ``word`` in the ``lang`` language. This function supports the following languages:

=========== ================
lang Language
=========== ================
``ca`` ``Catalan``
``da`` ``Danish``
``de`` ``German``
``en`` ``English``
``es`` ``Spanish``
``eu`` ``Basque``
``fi`` ``Finnish``
``fr`` ``French``
``hu`` ``Hungarian``
``hy`` ``Armenian``
``ir`` ``Irish``
``it`` ``Italian``
``lt`` ``Lithuanian``
``nl`` ``Dutch``
``no`` ``Norwegian``
``pt`` ``Portuguese``
``ro`` ``Romanian``
``ru`` ``Russian``
``sv`` ``Swedish``
``tr`` ``Turkish``
=========== ================

If the specified ``lang`` is not supported, this function throws a user error.


Unicode Functions
-----------------

Expand Down
3 changes: 2 additions & 1 deletion velox/functions/prestosql/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ target_link_libraries(
velox_type_tz
velox_presto_types
velox_functions_util
Folly::folly)
Folly::folly
stemmer::stemmer)

set_property(TARGET velox_functions_prestosql_impl PROPERTY JOB_POOL_COMPILE
high_memory_pool)
Expand Down
132 changes: 132 additions & 0 deletions velox/functions/prestosql/WordStem.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <folly/container/F14Map.h>
#include <libstemmer.h> // @manual

#include "velox/functions/Udf.h"
#include "velox/functions/lib/string/StringImpl.h"

namespace facebook::velox::functions {

namespace detail {
// Owning wrapper around a libstemmer (Snowball) `sb_stemmer` handle. The
// handle is released with sb_stemmer_delete() when the wrapper is destroyed.
// The wrapper is non-copyable, since a copy would release the same handle
// twice.
class Stemmer {
 public:
  // Takes ownership of 'stemmer', which must not be null.
  explicit Stemmer(sb_stemmer* stemmer) : sbStemmer_(stemmer) {
    VELOX_CHECK_NOT_NULL(stemmer);
  }

  ~Stemmer() {
    sb_stemmer_delete(sbStemmer_);
  }

  Stemmer(const Stemmer&) = delete;
  Stemmer& operator=(const Stemmer&) = delete;

  // Returns the stem of 'input', or NULL if the library ran out of memory.
  // NOTE(review): per the libstemmer API the returned buffer belongs to the
  // stemmer and is only valid until the next stem() call on this object or
  // until the object is destroyed -- callers should copy it right away.
  const char* stem(const std::string& input) {
    return reinterpret_cast<const char*>(sb_stemmer_stem(
        sbStemmer_,
        reinterpret_cast<const unsigned char*>(input.c_str()),
        // The C API takes the length as an int.
        static_cast<int>(input.length())));
  }

 private:
  sb_stemmer* sbStemmer_;
};
} // namespace detail

/// word_stem function
///   word_stem(word) -> varchar
///     Returns the stem of the word in the English language.
///   word_stem(word, lang) -> varchar
///     Returns the stem of the word in the specified language.
///
/// Uses the Snowball stemmer library (https://snowballstem.org) to compute the
/// stem. The website provides a Java implementation, which is used in Presto,
/// as well as a C implementation, which is used here. Therefore, both Presto
/// and Prestissimo are expected to produce the same word stems.
///
/// The input is lower-cased before it is handed to the stemmer. A language
/// the stemmer library does not recognize results in a user error.
template <typename TExec>
struct WordStemFunction {
  VELOX_DEFINE_FUNCTION_TYPES(TExec);

  // ASCII input always produces ASCII result.
  static constexpr bool is_default_ascii_behavior = true;

  // word_stem(word): stems 'input' using the default language ("en").
  FOLLY_ALWAYS_INLINE void call(
      out_type<Varchar>& result,
      const arg_type<Varchar>& input) {
    return doCall<false>(result, input);
  }

  // Same as call(result, input), but lower-cases via the ASCII code path.
  FOLLY_ALWAYS_INLINE void callAscii(
      out_type<Varchar>& result,
      const arg_type<Varchar>& input) {
    return doCall<true>(result, input);
  }

  // word_stem(word, lang): stems 'input' using the language named by 'lang'.
  FOLLY_ALWAYS_INLINE void call(
      out_type<Varchar>& result,
      const arg_type<Varchar>& input,
      const arg_type<Varchar>& lang) {
    return doCall<false>(result, input, lang);
  }

  // Same as call(result, input, lang), but lower-cases via the ASCII code
  // path.
  FOLLY_ALWAYS_INLINE void callAscii(
      out_type<Varchar>& result,
      const arg_type<Varchar>& input,
      const arg_type<Varchar>& lang) {
    return doCall<true>(result, input, lang);
  }

  // Shared implementation of all overloads.
  //
  // @tparam isAscii Selects which stringImpl::lower specialization is used.
  // @param result Receives the stem.
  // @param input Word to stem.
  // @param lang Language code passed to the stemmer library; defaults to
  // "en".
  // Throws a user error if 'lang' is not supported, and a (non-user) error
  // if the stemmer library returns NULL.
  template <bool isAscii>
  FOLLY_ALWAYS_INLINE void doCall(
      out_type<Varchar>& result,
      const arg_type<Varchar>& input,
      const std::string& lang = "en") {
    auto* stemmer = getStemmer(lang);
    VELOX_USER_CHECK_NOT_NULL(
        stemmer, "Unsupported stemmer language: {}", lang);

    std::string lowerOutput;
    stringImpl::lower<isAscii>(lowerOutput, input);
    auto* stem = stemmer->stem(lowerOutput);
    VELOX_CHECK_NOT_NULL(
        stem, "Stemmer library returned a NULL (out-of-memory)");
    // NOTE(review): 'stem' points at storage owned by the stemmer; this
    // assumes the assignment copies the bytes into 'result' -- confirm.
    result = stem;
  }

 private:
  // Stemmers created so far by this function instance, keyed by language
  // code.
  folly::F14FastMap<std::string, std::unique_ptr<detail::Stemmer>> stemmers_;

  // Get a detail::Stemmer from the map using the lang as the key or create
  // a new one if it doesn't exist. Return nullptr if the specified lang is not
  // supported.
  // NOTE(review): the function documentation lists `ir` for Irish; confirm
  // that the stemmer library accepts that code (it may only know Irish under
  // a different code such as `ga`).
  detail::Stemmer* getStemmer(const std::string& lang) {
    if (auto found = stemmers_.find(lang); found != stemmers_.end()) {
      return found->second.get();
    }
    // Only support ASCII and UTF-8.
    auto* sbStemmer = sb_stemmer_new(lang.c_str(), "UTF_8");
    if (sbStemmer == nullptr) {
      return nullptr;
    }
    // The map owns the wrapper; hand out a non-owning pointer to it.
    return stemmers_
        .emplace(lang, std::make_unique<detail::Stemmer>(sbStemmer))
        .first->second.get();
  }
};
} // namespace facebook::velox::functions
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "velox/functions/prestosql/SplitPart.h"
#include "velox/functions/prestosql/SplitToMap.h"
#include "velox/functions/prestosql/StringFunctions.h"
#include "velox/functions/prestosql/WordStem.h"

namespace facebook::velox::functions {

Expand Down Expand Up @@ -127,5 +128,10 @@ void registerStringFunctions(const std::string& prefix) {
{prefix + "strrpos"});
registerFunction<StrRPosFunction, int64_t, Varchar, Varchar, int64_t>(
{prefix + "strrpos"});

// word_stem function
registerFunction<WordStemFunction, Varchar, Varchar>({prefix + "word_stem"});
registerFunction<WordStemFunction, Varchar, Varchar, Varchar>(
{prefix + "word_stem"});
}
} // namespace facebook::velox::functions
1 change: 1 addition & 0 deletions velox/functions/prestosql/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ add_executable(
URLFunctionsTest.cpp
Utf8Test.cpp
WidthBucketArrayTest.cpp
WordStemTest.cpp
ZipTest.cpp
ZipWithTest.cpp)

Expand Down
80 changes: 80 additions & 0 deletions velox/functions/prestosql/tests/WordStemTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <optional>
#include <string>

#include "velox/common/base/tests/GTestUtils.h"
#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h"

using namespace facebook::velox::functions::test;

namespace facebook::velox::functions {
namespace {
// Test fixture with helpers that evaluate word_stem() on a single row and
// return the resulting string.
class WordStemTest : public FunctionBaseTest {
 protected:
  // Evaluates the one-argument form, word_stem(word).
  std::string wordStem(const std::string& word) {
    const auto stemmed =
        evaluateOnce<std::string>("word_stem(c0)", std::optional(word));
    return stemmed.value();
  }

  // Evaluates the two-argument form, word_stem(word, lang).
  std::string wordStem(const std::string& word, const std::string& lang) {
    const auto stemmed = evaluateOnce<std::string>(
        "word_stem(c0, c1)", std::optional(word), std::optional(lang));
    return stemmed.value();
  }
};

/// The cases below are borrowed from the Presto Java test suite:
/// https://github.com/prestodb/presto/blob/master/presto-main/src/test/java/com/facebook/presto/operator/scalar/TestWordStemFunction.java
TEST_F(WordStemTest, asciiWord) {
  struct StemCase {
    const char* word;
    // nullptr selects the one-argument overload (default language).
    const char* lang;
    const char* expected;
  };

  const StemCase stemCases[] = {
      {"", nullptr, ""},
      {"x", nullptr, "x"},
      {"abc", nullptr, "abc"},
      {"generally", nullptr, "general"},
      {"useful", nullptr, "use"},
      {"runs", nullptr, "run"},
      {"run", nullptr, "run"},
      {"authorized", "en", "author"},
      {"accessories", "en", "accessori"},
      {"intensifying", "en", "intensifi"},
      {"resentment", "en", "resent"},
      {"faithfulness", "en", "faith"},
      {"continuerait", "fr", "continu"},
      {"torpedearon", "es", "torped"},
      {"quilomtricos", "pt", "quilomtr"},
      {"pronunziare", "it", "pronunz"},
      {"auferstnde", "de", "auferstnd"},
  };

  for (const auto& stemCase : stemCases) {
    if (stemCase.lang == nullptr) {
      EXPECT_EQ(wordStem(stemCase.word), stemCase.expected)
          << "word: " << stemCase.word;
    } else {
      EXPECT_EQ(wordStem(stemCase.word, stemCase.lang), stemCase.expected)
          << "word: " << stemCase.word << ", lang: " << stemCase.lang;
    }
  }
}

// A language code the stemmer does not know ("xx") must fail with an error
// message that names the offending code.
TEST_F(WordStemTest, invalidLang) {
VELOX_ASSERT_THROW(
wordStem("hello", "xx"), "Unsupported stemmer language: xx");
}

// Non-ASCII input: the words are spelled with universal character names so
// the test does not depend on the encoding of this source file.
TEST_F(WordStemTest, unicodeWord) {
  // Turkish word that starts with an upper-case 'K'; the expected stem is
  // all lower-case.
  const std::string turkishWord =
      "\u004b\u0069\u0074\u0061\u0062\u0131\u006d\u0131\u007a\u0064\u0131";
  EXPECT_EQ(wordStem(turkishWord, "tr"), "kitap");

  // Russian word written in Cyrillic; the stem is Cyrillic as well.
  const std::string russianWord =
      "\u0432\u0435\u0441\u0435\u043d\u043d\u0438\u0439";
  const std::string russianStem = "\u0432\u0435\u0441\u0435\u043d";
  EXPECT_EQ(wordStem(russianWord, "ru"), russianStem);
}

} // namespace
} // namespace facebook::velox::functions

0 comments on commit 4797041

Please sign in to comment.