Skip to content

Commit

Permalink
Add wordnet based lemmatizer (#1991)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Remove libwn, because it could not return the same results as
NLTK's WordNetLemmatizer.

Issue link: #1973

### Type of change

- [X] New Feature (non-breaking change which adds functionality)
  • Loading branch information
yingfeng authored Oct 8, 2024
1 parent db7d7f2 commit 8f4bcdc
Show file tree
Hide file tree
Showing 16 changed files with 292 additions and 5,159 deletions.
12 changes: 2 additions & 10 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ target_sources(infinity_core
${network_cppm}
)

add_dependencies(infinity_core thrift thriftnb parquet_static snappy re2 wn)
add_dependencies(infinity_core thrift thriftnb parquet_static snappy re2)
target_include_directories(infinity_core PUBLIC ${Python3_INCLUDE_DIRS})
target_include_directories(infinity_core PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
target_include_directories(infinity_core PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/parser")
Expand Down Expand Up @@ -280,7 +280,6 @@ target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party
target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party/curl/include")
target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party/darts/")
target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party/re2")
target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party/wordnet")

if (NOT SUPPORT_FMA EQUAL 0)
message(FATAL_ERROR "This project requires the processor support fused multiply-add (FMA) instructions.")
Expand Down Expand Up @@ -350,7 +349,6 @@ target_link_libraries(infinity
ssl.a
crypto.a
re2.a
wn.a
)
target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/lib")
target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/third_party/oatpp/src/")
Expand All @@ -362,7 +360,6 @@ target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/third_party/curlpp/
target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/third_party/curl/")
target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/third_party/")
target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/third_party/re2/")
target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/third_party/wordnet/")
target_link_directories(infinity PUBLIC "/usr/local/openssl30/lib64")

target_include_directories(infinity PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
Expand Down Expand Up @@ -420,7 +417,6 @@ if (SKBUILD)
ssl.a
crypto.a
re2.a
wn.a
)

# WARN: python modules shall not link to static libstdc++!!!
Expand All @@ -436,7 +432,6 @@ if (SKBUILD)
target_link_directories(embedded_infinity_ext PUBLIC "${CMAKE_BINARY_DIR}/third_party/curlpp/")
target_link_directories(embedded_infinity_ext PUBLIC "${CMAKE_BINARY_DIR}/third_party/curl/")
target_link_directories(embedded_infinity_ext PUBLIC "${CMAKE_BINARY_DIR}/third_party/re2/")
target_link_directories(embedded_infinity_ext PUBLIC "${CMAKE_BINARY_DIR}/third_party/wordnet/")
target_link_directories(embedded_infinity_ext PUBLIC "${CMAKE_BINARY_DIR}/third_party/")
target_link_directories(embedded_infinity_ext PUBLIC "/usr/local/openssl30/lib64")
nanobind_disable_stack_protector(embedded_infinity_ext)
Expand Down Expand Up @@ -542,7 +537,7 @@ add_executable(unit_test
)

set_target_properties(unit_test PROPERTIES OUTPUT_NAME test_main)
add_dependencies(unit_test oatpp miniocpp pugixml-static curlpp_static inih libcurl_static re2 wn)
add_dependencies(unit_test oatpp miniocpp pugixml-static curlpp_static inih libcurl_static re2)

target_link_libraries(unit_test
gtest
Expand Down Expand Up @@ -570,7 +565,6 @@ target_link_libraries(unit_test
event.a
miniocpp.a
re2.a
wn.a
pugixml-static
curlpp_static
inih.a
Expand All @@ -588,7 +582,6 @@ target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/pugixm
target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/curlpp/")
target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/curl/")
target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/re2/")
target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/wordnet/")
target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/")
target_link_directories(unit_test PUBLIC "/usr/local/openssl30/lib64")

Expand Down Expand Up @@ -617,7 +610,6 @@ target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/cur
target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/curl/include")
target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/darts")
target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/re2")
target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/wordnet")


# target_compile_options(unit_test PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mpopcnt>)
Expand Down
219 changes: 219 additions & 0 deletions src/common/analyzer/lemmatizer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
module;

#include <cassert>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <unordered_map>

module lemmatizer;

import stl;
import third_party;

namespace infinity {

// Single-letter WordNet part-of-speech tags, matching the codes used by
// NLTK's WordNetLemmatizer and by the WordNet database files themselves.
static const String ADJ = "a";
static const String ADJ_SAT = "s"; // adjective satellite; shares data with ADJ
static const String ADV = "r";
static const String NOUN = "n";
static const String VERB = "v";

// path: directory containing the WordNet dictionary files (index.*, *.exc).
// No I/O happens here; all data is read in Load().
Lemmatizer::Lemmatizer(const String &path) : path_(path) {}

Lemmatizer::~Lemmatizer() {}

// Initializes the in-memory lookup tables and reads the WordNet index and
// exception files from path_. Returns an error status if an index file
// cannot be loaded (the original implementation silently discarded that
// failure and always reported OK).
Status Lemmatizer::Load() {
    // wninit(path_.c_str());

    // POS tag -> dictionary file suffix (e.g. "n" -> index.noun / noun.exc).
    file_map_ = {{ADJ, "adj"}, {ADV, "adv"}, {NOUN, "noun"}, {VERB, "verb"}};
    pos_numbers_ = {{NOUN, 1}, {VERB, 2}, {ADJ, 3}, {ADV, 4}, {ADJ_SAT, 5}};
    // (old suffix -> new suffix) rewrite rules per POS, applied by ApplyRules().
    MORPHOLOGICAL_SUBSTITUTIONS = {
        {NOUN, {{"s", ""}, {"ses", "s"}, {"ves", "f"}, {"xes", "x"}, {"zes", "z"}, {"ches", "ch"}, {"shes", "sh"}, {"men", "man"}, {"ies", "y"}}},
        {VERB, {{"s", ""}, {"ies", "y"}, {"es", "e"}, {"es", ""}, {"ed", "e"}, {"ed", ""}, {"ing", "e"}, {"ing", ""}}},
        {ADJ, {{"er", ""}, {"est", ""}, {"er", "e"}, {"est", "e"}}},
        {ADV, {}},
        {ADJ_SAT, {{"er", ""}, {"est", ""}, {"er", "e"}, {"est", "e"}}}};

    // pos_names_ is the inverse of pos_numbers_.
    pos_names_.clear();
    for (const auto &pair : pos_numbers_) {
        pos_names_[pair.second] = pair.first;
    }

    // Search order used by Lemmatize() when no POS is supplied.
    POS_LIST = {NOUN, VERB, ADJ, ADV};

    // Propagate index-file failures to the caller instead of dropping them.
    Status load_status = LoadLemmaPosOffsetMap();
    if (!load_status.ok()) {
        return load_status;
    }
    LoadExceptionMap();

    return Status::OK();
}

// Parses the four WordNet index files (index.noun, index.verb, index.adj,
// index.adv) into lemma_pos_offset_map_: lemma -> POS tag -> synset offsets.
// Adjective entries are duplicated under the satellite tag (ADJ_SAT),
// mirroring NLTK's behavior.
//
// Fixes over the original: the inner loop variable no longer shadows the
// outer `pos` (file suffix vs. per-entry POS tag), and malformed lines are
// detected with explicit stream checks instead of release-mode-disabled
// asserts plus a dead try/catch (iostream extraction does not throw by
// default, so the catch block was unreachable).
Status Lemmatizer::LoadLemmaPosOffsetMap() {
    for (const auto &pair : file_map_) {
        const String &suffix = pair.second; // file suffix: "noun", "verb", ...

        std::ifstream file(path_ + "/" + "index." + suffix);
        if (!file.is_open()) {
            return Status::InvalidAnalyzerFile(fmt::format("Failed to load wordnet lemmatizer, index.{}", suffix));
        }

        String line;

        while (std::getline(file, line)) {
            if (line.empty() || line[0] == ' ') {
                continue; // WordNet index files start with a license header whose lines begin with spaces
            }

            std::istringstream stream(line);
            String lemma;
            String pos; // single-letter POS tag of this entry ("n", "v", "a", "r")
            std::getline(stream, lemma, ' ');
            std::getline(stream, pos, ' ');

            int n_synsets = 0;
            stream >> n_synsets;
            if (!stream || n_synsets <= 0) {
                return Status::InvalidAnalyzerFile(fmt::format("Failed to load wordnet lemmatizer, index.{}", suffix));
            }

            int n_pointers = 0;
            stream >> n_pointers;
            // Skip the pointer symbols; they are irrelevant for lemmatization.
            for (int i = 0; i < n_pointers; ++i) {
                String pointer_symbol;
                stream >> pointer_symbol;
            }

            // Sense count (equals n_synsets in well-formed files) — skipped.
            int n_senses = 0;
            stream >> n_senses;

            // Number of senses ranked by frequency — skipped.
            int n_ranked_senses = 0;
            stream >> n_ranked_senses;

            // Read the synset offsets for this lemma.
            Vector<int> synset_offsets(n_synsets);
            for (int i = 0; i < n_synsets; ++i) {
                stream >> synset_offsets[i];
            }
            if (!stream) {
                return Status::InvalidAnalyzerFile(fmt::format("Failed to load wordnet lemmatizer, index.{}", suffix));
            }

            lemma_pos_offset_map_[lemma][pos] = synset_offsets;

            // NLTK duplicates every adjective entry under the satellite tag.
            if (pos == ADJ) {
                lemma_pos_offset_map_[lemma][ADJ_SAT] = synset_offsets;
            }
        }
    }
    return Status::OK();
}

// Loads the WordNet exception files (*.exc), which map irregular inflected
// forms (e.g. "geese") to their base forms (e.g. "goose"), into
// exception_map_: POS tag -> inflected form -> base forms.
// The leftover debug print to std::cout has been removed.
void Lemmatizer::LoadExceptionMap() {
    for (const auto &pair : file_map_) {
        const auto &pos = pair.first;    // POS tag ("n", "v", "a", "r")
        const auto &suffix = pair.second; // file suffix ("noun", "verb", ...)

        std::ifstream file(path_ + "/" + suffix + ".exc");
        exception_map_[pos] = {};

        // If the file is missing, getline on the failed stream reads nothing
        // and this POS keeps an empty exception table; lemmatization then
        // falls back to the suffix rules alone.
        String line;
        while (std::getline(file, line)) {
            std::istringstream stream(line);
            String term;
            stream >> term; // the irregular (inflected) form
            // All remaining whitespace-separated tokens are its base forms.
            Vector<String> exceptions{std::istream_iterator<String>{stream}, std::istream_iterator<String>{}};
            exception_map_[pos][term] = exceptions;
        }
    }
    // The satellite-adjective table is identical to the adjective one.
    exception_map_[ADJ_SAT] = exception_map_.at(ADJ);
}

// Generates candidate base forms by applying every suffix-substitution rule
// for the given POS to every input form. One candidate is produced for each
// rule whose old suffix matches the end of a form; candidates are NOT
// validated against the dictionary here — FilterForms() does that.
Vector<String> Lemmatizer::ApplyRules(const Vector<String> &forms, const String &pos) {
    Vector<String> candidates;
    const auto &rules = MORPHOLOGICAL_SUBSTITUTIONS.at(pos);
    for (const auto &word : forms) {
        const auto word_len = word.size();
        for (const auto &rule : rules) {
            const String &suffix_old = rule.first;
            const String &suffix_new = rule.second;
            const auto suffix_len = suffix_old.size();
            if (word_len < suffix_len) {
                continue; // word too short to carry this suffix
            }
            if (word.compare(word_len - suffix_len, suffix_len, suffix_old) != 0) {
                continue; // word does not end with the rule's old suffix
            }
            candidates.push_back(word.substr(0, word_len - suffix_len) + suffix_new);
        }
    }
    return candidates;
}

// Keeps only the candidate forms that actually occur in WordNet for the
// given POS, deduplicating while preserving first-seen order.
//
// Fix: the original performed three lookups per form (find + two
// operator[] calls); this does a single find per map level. It also avoids
// operator[] on lemma_pos_offset_map_, which would insert an empty entry
// on a miss if the surrounding guard were ever reordered.
Vector<String> Lemmatizer::FilterForms(const Vector<String> &forms, const String &pos) {
    Vector<String> result;
    Set<String> seen;
    for (const auto &form : forms) {
        auto lemma_it = lemma_pos_offset_map_.find(form);
        if (lemma_it == lemma_pos_offset_map_.end()) {
            continue; // form is not a WordNet lemma at all
        }
        if (lemma_it->second.find(pos) == lemma_it->second.end()) {
            continue; // lemma exists, but not for this POS
        }
        // Set::insert returns {iterator, inserted}; inserted is false for duplicates.
        if (seen.insert(form).second) {
            result.push_back(form);
        }
    }
    return result;
}

// Core morphological analysis for a single POS. If the word is a known
// irregular form (and check_exceptions is set), its exception entries are
// taken as candidates; otherwise the suffix rules generate them. The
// original form is always appended, and every candidate is then validated
// against the dictionary via FilterForms().
Vector<String> Lemmatizer::Morphy_(const String &form, const String &pos, bool check_exceptions) {
    const auto &exceptions = exception_map_.at(pos);

    auto exc_it = check_exceptions ? exceptions.find(form) : exceptions.end();

    Vector<String> candidates;
    if (exc_it != exceptions.end()) {
        candidates = exc_it->second;
    } else {
        candidates = ApplyRules({form}, pos);
    }

    candidates.push_back(form); // the original form is always a candidate
    return FilterForms(candidates, pos);
}

// Returns the lemma of `form`. When `pos` is non-empty only that POS is
// tried; otherwise noun, verb, adjective and adverb are tried in order
// (POS_LIST), and the first POS yielding any analysis wins. If no POS
// produces an analysis, the input is returned unchanged.
String Lemmatizer::Lemmatize(const String &form, const String &pos) {
    Vector<String> pos_tags;
    if (pos.empty()) {
        pos_tags = POS_LIST; // default search order set up in Load()
    } else {
        pos_tags = {pos};
    }

    for (const auto &tag : pos_tags) {
        Vector<String> analyses = Morphy_(form, tag);
        if (!analyses.empty()) {
            return analyses[0]; // first successful analysis wins
        }
    }

    // No POS produced a dictionary-validated lemma.
    return form;
}

} // namespace infinity
56 changes: 56 additions & 0 deletions src/common/analyzer/lemmatizer.cppm
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

export module lemmatizer;

import stl;
import status;

namespace infinity {

// WordNet-based lemmatizer that reads the raw WordNet dictionary files and
// reimplements the lookup strategy of NLTK's WordNetLemmatizer.
export class Lemmatizer {
public:
    // path: directory containing the WordNet data files (index.*, *.exc).
    // No file I/O happens until Load() is called.
    Lemmatizer(const String &path);

    ~Lemmatizer();

    // Reads the index and exception files from the path given at
    // construction and builds the in-memory lookup tables.
    Status Load();

    // Returns the base form of `form`. If `pos` is empty, all parts of
    // speech are tried in order; otherwise only the given single-letter
    // WordNet POS tag is used. Falls back to returning `form` unchanged.
    String Lemmatize(const String &form, const String &pos = "");

private:
    // Parses the index.<suffix> files into lemma_pos_offset_map_.
    Status LoadLemmaPosOffsetMap();

    // Parses the <suffix>.exc files (irregular forms) into exception_map_.
    void LoadExceptionMap();

    // Morphological analysis for a single POS; returns dictionary-validated
    // candidate lemmas for `form`.
    Vector<String> Morphy_(const String &form, const String &pos, bool check_exceptions = true);

    // Applies the POS-specific suffix-substitution rules to each form.
    Vector<String> ApplyRules(const Vector<String> &forms, const String &pos);

    // Keeps only forms present in WordNet for the given POS, deduplicated.
    Vector<String> FilterForms(const Vector<String> &forms, const String &pos);

    // Directory holding the WordNet data files.
    String path_;

    // lemma -> POS tag -> synset offsets, from the index files.
    HashMap<String, HashMap<String, Vector<int>>> lemma_pos_offset_map_;
    // POS tag -> irregular form -> base forms, from the exception files.
    HashMap<String, HashMap<String, Vector<String>>> exception_map_;
    // POS tag -> numeric id, and its inverse.
    HashMap<String, int> pos_numbers_;
    HashMap<int, String> pos_names_;
    // POS tag -> dictionary file suffix ("noun", "verb", "adj", "adv").
    HashMap<String, String> file_map_;
    // POS tag -> (old suffix, new suffix) substitution rules.
    HashMap<String, Vector<Pair<String, String>>> MORPHOLOGICAL_SUBSTITUTIONS;
    // Default POS search order used by Lemmatize() when no POS is given.
    Vector<String> POS_LIST;
};

} // namespace infinity
Loading

0 comments on commit 8f4bcdc

Please sign in to comment.