Skip to content

Commit

Permalink
Add wordnet based lemmatizer (#1991)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Remove libwn, because it could not return the same results as
NLTK's WordNetLemmatizer.

Issue link: #1973

### Type of change

- [X] New Feature (non-breaking change which adds functionality)
  • Loading branch information
yingfeng authored Oct 8, 2024
1 parent db7d7f2 commit 8f4bcdc
Show file tree
Hide file tree
Showing 16 changed files with 292 additions and 5,159 deletions.
12 changes: 2 additions & 10 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ target_sources(infinity_core
${network_cppm}
)

add_dependencies(infinity_core thrift thriftnb parquet_static snappy re2 wn)
add_dependencies(infinity_core thrift thriftnb parquet_static snappy re2)
target_include_directories(infinity_core PUBLIC ${Python3_INCLUDE_DIRS})
target_include_directories(infinity_core PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
target_include_directories(infinity_core PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/parser")
Expand Down Expand Up @@ -280,7 +280,6 @@ target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party
target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party/curl/include")
target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party/darts/")
target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party/re2")
target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party/wordnet")

if (NOT SUPPORT_FMA EQUAL 0)
message(FATAL_ERROR "This project requires the processor support fused multiply-add (FMA) instructions.")
Expand Down Expand Up @@ -350,7 +349,6 @@ target_link_libraries(infinity
ssl.a
crypto.a
re2.a
wn.a
)
target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/lib")
target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/third_party/oatpp/src/")
Expand All @@ -362,7 +360,6 @@ target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/third_party/curlpp/
target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/third_party/curl/")
target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/third_party/")
target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/third_party/re2/")
target_link_directories(infinity PUBLIC "${CMAKE_BINARY_DIR}/third_party/wordnet/")
target_link_directories(infinity PUBLIC "/usr/local/openssl30/lib64")

target_include_directories(infinity PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
Expand Down Expand Up @@ -420,7 +417,6 @@ if (SKBUILD)
ssl.a
crypto.a
re2.a
wn.a
)

# WARN: python modules shall not link to static libstdc++!!!
Expand All @@ -436,7 +432,6 @@ if (SKBUILD)
target_link_directories(embedded_infinity_ext PUBLIC "${CMAKE_BINARY_DIR}/third_party/curlpp/")
target_link_directories(embedded_infinity_ext PUBLIC "${CMAKE_BINARY_DIR}/third_party/curl/")
target_link_directories(embedded_infinity_ext PUBLIC "${CMAKE_BINARY_DIR}/third_party/re2/")
target_link_directories(embedded_infinity_ext PUBLIC "${CMAKE_BINARY_DIR}/third_party/wordnet/")
target_link_directories(embedded_infinity_ext PUBLIC "${CMAKE_BINARY_DIR}/third_party/")
target_link_directories(embedded_infinity_ext PUBLIC "/usr/local/openssl30/lib64")
nanobind_disable_stack_protector(embedded_infinity_ext)
Expand Down Expand Up @@ -542,7 +537,7 @@ add_executable(unit_test
)

set_target_properties(unit_test PROPERTIES OUTPUT_NAME test_main)
add_dependencies(unit_test oatpp miniocpp pugixml-static curlpp_static inih libcurl_static re2 wn)
add_dependencies(unit_test oatpp miniocpp pugixml-static curlpp_static inih libcurl_static re2)

target_link_libraries(unit_test
gtest
Expand Down Expand Up @@ -570,7 +565,6 @@ target_link_libraries(unit_test
event.a
miniocpp.a
re2.a
wn.a
pugixml-static
curlpp_static
inih.a
Expand All @@ -588,7 +582,6 @@ target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/pugixm
target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/curlpp/")
target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/curl/")
target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/re2/")
target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/wordnet/")
target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/")
target_link_directories(unit_test PUBLIC "/usr/local/openssl30/lib64")

Expand Down Expand Up @@ -617,7 +610,6 @@ target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/cur
target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/curl/include")
target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/darts")
target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/re2")
target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/wordnet")


# target_compile_options(unit_test PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mpopcnt>)
Expand Down
219 changes: 219 additions & 0 deletions src/common/analyzer/lemmatizer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
module;

#include <cassert>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <unordered_map>

module lemmatizer;

import stl;
import third_party;

namespace infinity {

// Single-letter WordNet part-of-speech tags, matching the codes used by
// NLTK's WordNetLemmatizer and by the WordNet database files themselves.
static const String ADJ = "a";
static const String ADJ_SAT = "s"; // adjective satellite; shares data with ADJ
static const String ADV = "r";
static const String NOUN = "n";
static const String VERB = "v";

// path: directory containing the WordNet dictionary files (index.*, *.exc).
// No I/O happens here; all data is read in Load().
Lemmatizer::Lemmatizer(const String &path) : path_(path) {}

Lemmatizer::~Lemmatizer() {}

// Initializes the in-memory lookup tables and reads the WordNet index and
// exception files from path_. Returns an error status if an index file
// cannot be loaded (the original implementation silently discarded that
// failure and always reported OK).
Status Lemmatizer::Load() {
    // wninit(path_.c_str());

    // POS tag -> dictionary file suffix (e.g. "n" -> index.noun / noun.exc).
    file_map_ = {{ADJ, "adj"}, {ADV, "adv"}, {NOUN, "noun"}, {VERB, "verb"}};
    pos_numbers_ = {{NOUN, 1}, {VERB, 2}, {ADJ, 3}, {ADV, 4}, {ADJ_SAT, 5}};
    // (old suffix -> new suffix) rewrite rules per POS, applied by ApplyRules().
    MORPHOLOGICAL_SUBSTITUTIONS = {
        {NOUN, {{"s", ""}, {"ses", "s"}, {"ves", "f"}, {"xes", "x"}, {"zes", "z"}, {"ches", "ch"}, {"shes", "sh"}, {"men", "man"}, {"ies", "y"}}},
        {VERB, {{"s", ""}, {"ies", "y"}, {"es", "e"}, {"es", ""}, {"ed", "e"}, {"ed", ""}, {"ing", "e"}, {"ing", ""}}},
        {ADJ, {{"er", ""}, {"est", ""}, {"er", "e"}, {"est", "e"}}},
        {ADV, {}},
        {ADJ_SAT, {{"er", ""}, {"est", ""}, {"er", "e"}, {"est", "e"}}}};

    // pos_names_ is the inverse of pos_numbers_.
    pos_names_.clear();
    for (const auto &pair : pos_numbers_) {
        pos_names_[pair.second] = pair.first;
    }

    // Search order used by Lemmatize() when no POS is supplied.
    POS_LIST = {NOUN, VERB, ADJ, ADV};

    // Propagate index-file failures to the caller instead of dropping them.
    Status load_status = LoadLemmaPosOffsetMap();
    if (!load_status.ok()) {
        return load_status;
    }
    LoadExceptionMap();

    return Status::OK();
}

// Parses the four WordNet index files (index.noun, index.verb, index.adj,
// index.adv) into lemma_pos_offset_map_: lemma -> POS tag -> synset offsets.
// Adjective entries are duplicated under the satellite tag (ADJ_SAT),
// mirroring NLTK's behavior.
//
// Fixes over the original: the inner loop variable no longer shadows the
// outer `pos` (file suffix vs. per-entry POS tag), and malformed lines are
// detected with explicit stream checks instead of release-mode-disabled
// asserts plus a dead try/catch (iostream extraction does not throw by
// default, so the catch block was unreachable).
Status Lemmatizer::LoadLemmaPosOffsetMap() {
    for (const auto &pair : file_map_) {
        const String &suffix = pair.second; // file suffix: "noun", "verb", ...

        std::ifstream file(path_ + "/" + "index." + suffix);
        if (!file.is_open()) {
            return Status::InvalidAnalyzerFile(fmt::format("Failed to load wordnet lemmatizer, index.{}", suffix));
        }

        String line;

        while (std::getline(file, line)) {
            if (line.empty() || line[0] == ' ') {
                continue; // WordNet index files start with a license header whose lines begin with spaces
            }

            std::istringstream stream(line);
            String lemma;
            String pos; // single-letter POS tag of this entry ("n", "v", "a", "r")
            std::getline(stream, lemma, ' ');
            std::getline(stream, pos, ' ');

            int n_synsets = 0;
            stream >> n_synsets;
            if (!stream || n_synsets <= 0) {
                return Status::InvalidAnalyzerFile(fmt::format("Failed to load wordnet lemmatizer, index.{}", suffix));
            }

            int n_pointers = 0;
            stream >> n_pointers;
            // Skip the pointer symbols; they are irrelevant for lemmatization.
            for (int i = 0; i < n_pointers; ++i) {
                String pointer_symbol;
                stream >> pointer_symbol;
            }

            // Sense count (equals n_synsets in well-formed files) — skipped.
            int n_senses = 0;
            stream >> n_senses;

            // Number of senses ranked by frequency — skipped.
            int n_ranked_senses = 0;
            stream >> n_ranked_senses;

            // Read the synset offsets for this lemma.
            Vector<int> synset_offsets(n_synsets);
            for (int i = 0; i < n_synsets; ++i) {
                stream >> synset_offsets[i];
            }
            if (!stream) {
                return Status::InvalidAnalyzerFile(fmt::format("Failed to load wordnet lemmatizer, index.{}", suffix));
            }

            lemma_pos_offset_map_[lemma][pos] = synset_offsets;

            // NLTK duplicates every adjective entry under the satellite tag.
            if (pos == ADJ) {
                lemma_pos_offset_map_[lemma][ADJ_SAT] = synset_offsets;
            }
        }
    }
    return Status::OK();
}

// Loads the WordNet exception files (*.exc), which map irregular inflected
// forms (e.g. "geese") to their base forms (e.g. "goose"), into
// exception_map_: POS tag -> inflected form -> base forms.
// The leftover debug print to std::cout has been removed.
void Lemmatizer::LoadExceptionMap() {
    for (const auto &pair : file_map_) {
        const auto &pos = pair.first;    // POS tag ("n", "v", "a", "r")
        const auto &suffix = pair.second; // file suffix ("noun", "verb", ...)

        std::ifstream file(path_ + "/" + suffix + ".exc");
        exception_map_[pos] = {};

        // If the file is missing, getline on the failed stream reads nothing
        // and this POS keeps an empty exception table; lemmatization then
        // falls back to the suffix rules alone.
        String line;
        while (std::getline(file, line)) {
            std::istringstream stream(line);
            String term;
            stream >> term; // the irregular (inflected) form
            // All remaining whitespace-separated tokens are its base forms.
            Vector<String> exceptions{std::istream_iterator<String>{stream}, std::istream_iterator<String>{}};
            exception_map_[pos][term] = exceptions;
        }
    }
    // The satellite-adjective table is identical to the adjective one.
    exception_map_[ADJ_SAT] = exception_map_.at(ADJ);
}

// Generates candidate base forms by applying every suffix-substitution rule
// for the given POS to every input form. One candidate is produced for each
// rule whose old suffix matches the end of a form; candidates are NOT
// validated against the dictionary here — FilterForms() does that.
Vector<String> Lemmatizer::ApplyRules(const Vector<String> &forms, const String &pos) {
    Vector<String> candidates;
    const auto &rules = MORPHOLOGICAL_SUBSTITUTIONS.at(pos);
    for (const auto &word : forms) {
        const auto word_len = word.size();
        for (const auto &rule : rules) {
            const String &suffix_old = rule.first;
            const String &suffix_new = rule.second;
            const auto suffix_len = suffix_old.size();
            if (word_len < suffix_len) {
                continue; // word too short to carry this suffix
            }
            if (word.compare(word_len - suffix_len, suffix_len, suffix_old) != 0) {
                continue; // word does not end with the rule's old suffix
            }
            candidates.push_back(word.substr(0, word_len - suffix_len) + suffix_new);
        }
    }
    return candidates;
}

// Keeps only the candidate forms that actually occur in WordNet for the
// given POS, deduplicating while preserving first-seen order.
//
// Fix: the original performed three lookups per form (find + two
// operator[] calls); this does a single find per map level. It also avoids
// operator[] on lemma_pos_offset_map_, which would insert an empty entry
// on a miss if the surrounding guard were ever reordered.
Vector<String> Lemmatizer::FilterForms(const Vector<String> &forms, const String &pos) {
    Vector<String> result;
    Set<String> seen;
    for (const auto &form : forms) {
        auto lemma_it = lemma_pos_offset_map_.find(form);
        if (lemma_it == lemma_pos_offset_map_.end()) {
            continue; // form is not a WordNet lemma at all
        }
        if (lemma_it->second.find(pos) == lemma_it->second.end()) {
            continue; // lemma exists, but not for this POS
        }
        // Set::insert returns {iterator, inserted}; inserted is false for duplicates.
        if (seen.insert(form).second) {
            result.push_back(form);
        }
    }
    return result;
}

// Core morphological analysis for a single POS. If the word is a known
// irregular form (and check_exceptions is set), its exception entries are
// taken as candidates; otherwise the suffix rules generate them. The
// original form is always appended, and every candidate is then validated
// against the dictionary via FilterForms().
Vector<String> Lemmatizer::Morphy_(const String &form, const String &pos, bool check_exceptions) {
    const auto &exceptions = exception_map_.at(pos);

    auto exc_it = check_exceptions ? exceptions.find(form) : exceptions.end();

    Vector<String> candidates;
    if (exc_it != exceptions.end()) {
        candidates = exc_it->second;
    } else {
        candidates = ApplyRules({form}, pos);
    }

    candidates.push_back(form); // the original form is always a candidate
    return FilterForms(candidates, pos);
}

// Returns the lemma of `form`. When `pos` is non-empty only that POS is
// tried; otherwise noun, verb, adjective and adverb are tried in order
// (POS_LIST), and the first POS yielding any analysis wins. If no POS
// produces an analysis, the input is returned unchanged.
String Lemmatizer::Lemmatize(const String &form, const String &pos) {
    Vector<String> pos_tags;
    if (pos.empty()) {
        pos_tags = POS_LIST; // default search order set up in Load()
    } else {
        pos_tags = {pos};
    }

    for (const auto &tag : pos_tags) {
        Vector<String> analyses = Morphy_(form, tag);
        if (!analyses.empty()) {
            return analyses[0]; // first successful analysis wins
        }
    }

    // No POS produced a dictionary-validated lemma.
    return form;
}

} // namespace infinity
56 changes: 56 additions & 0 deletions src/common/analyzer/lemmatizer.cppm
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

export module lemmatizer;

import stl;
import status;

namespace infinity {

// WordNet-based lemmatizer that reads the raw WordNet dictionary files and
// reimplements the lookup strategy of NLTK's WordNetLemmatizer.
export class Lemmatizer {
public:
    // path: directory containing the WordNet data files (index.*, *.exc).
    // No file I/O happens until Load() is called.
    Lemmatizer(const String &path);

    ~Lemmatizer();

    // Reads the index and exception files from the path given at
    // construction and builds the in-memory lookup tables.
    Status Load();

    // Returns the base form of `form`. If `pos` is empty, all parts of
    // speech are tried in order; otherwise only the given single-letter
    // WordNet POS tag is used. Falls back to returning `form` unchanged.
    String Lemmatize(const String &form, const String &pos = "");

private:
    // Parses the index.<suffix> files into lemma_pos_offset_map_.
    Status LoadLemmaPosOffsetMap();

    // Parses the <suffix>.exc files (irregular forms) into exception_map_.
    void LoadExceptionMap();

    // Morphological analysis for a single POS; returns dictionary-validated
    // candidate lemmas for `form`.
    Vector<String> Morphy_(const String &form, const String &pos, bool check_exceptions = true);

    // Applies the POS-specific suffix-substitution rules to each form.
    Vector<String> ApplyRules(const Vector<String> &forms, const String &pos);

    // Keeps only forms present in WordNet for the given POS, deduplicated.
    Vector<String> FilterForms(const Vector<String> &forms, const String &pos);

    // Directory holding the WordNet data files.
    String path_;

    // lemma -> POS tag -> synset offsets, from the index files.
    HashMap<String, HashMap<String, Vector<int>>> lemma_pos_offset_map_;
    // POS tag -> irregular form -> base forms, from the exception files.
    HashMap<String, HashMap<String, Vector<String>>> exception_map_;
    // POS tag -> numeric id, and its inverse.
    HashMap<String, int> pos_numbers_;
    HashMap<int, String> pos_names_;
    // POS tag -> dictionary file suffix ("noun", "verb", "adj", "adv").
    HashMap<String, String> file_map_;
    // POS tag -> (old suffix, new suffix) substitution rules.
    HashMap<String, Vector<Pair<String, String>>> MORPHOLOGICAL_SUBSTITUTIONS;
    // Default POS search order used by Lemmatize() when no POS is given.
    Vector<String> POS_LIST;
};

} // namespace infinity
Loading

0 comments on commit 8f4bcdc

Please sign in to comment.