diff --git a/CHANGELOG.md b/CHANGELOG.md index 59bcb448bbef2..ca23cd59e06d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add new metric REMOTE_STORE to NodeStats API response ([#15611](https://github.com/opensearch-project/OpenSearch/pull/15611)) - [S3 Repository] Change default retry mechanism of s3 clients to Standard Mode ([#15978](https://github.com/opensearch-project/OpenSearch/pull/15978)) - Add changes to block calls in cat shards, indices and segments based on dynamic limit settings ([#15986](https://github.com/opensearch-project/OpenSearch/pull/15986)) +- New `phone` & `phone-search` analyzer + tokenizer ([#15915](https://github.com/opensearch-project/OpenSearch/pull/15915)) ### Dependencies - Bump `com.azure:azure-identity` from 1.13.0 to 1.13.2 ([#15578](https://github.com/opensearch-project/OpenSearch/pull/15578)) diff --git a/gradle/missing-javadoc.gradle b/gradle/missing-javadoc.gradle index e9a6d798b8323..26898673bf608 100644 --- a/gradle/missing-javadoc.gradle +++ b/gradle/missing-javadoc.gradle @@ -127,6 +127,7 @@ configure([ project(":plugins:analysis-icu"), project(":plugins:analysis-kuromoji"), project(":plugins:analysis-nori"), + project(":plugins:analysis-phonenumber"), project(":plugins:analysis-phonetic"), project(":plugins:analysis-smartcn"), project(":plugins:analysis-stempel"), diff --git a/libs/core/src/main/java/org/opensearch/core/common/Strings.java b/libs/core/src/main/java/org/opensearch/core/common/Strings.java index 8fdec670bd9f2..e8379e11ea26a 100644 --- a/libs/core/src/main/java/org/opensearch/core/common/Strings.java +++ b/libs/core/src/main/java/org/opensearch/core/common/Strings.java @@ -815,4 +815,17 @@ public static String toLowercaseAscii(String in) { } return out.toString(); } + + /** + * Check whether every single character in the string is a digit. + * + *

An empty string returns {@code false}.

+ * + * @param s the string, must not be null. + * @return {@code true} if the string only contains digits, {@code false} otherwise. + */ + public static boolean isDigits(final String s) { + return !s.isEmpty() && s.chars().allMatch(Character::isDigit); + } + } diff --git a/libs/core/src/test/java/org/opensearch/core/common/StringsTests.java b/libs/core/src/test/java/org/opensearch/core/common/StringsTests.java index b79bb6fc89f9e..be7af18b106a8 100644 --- a/libs/core/src/test/java/org/opensearch/core/common/StringsTests.java +++ b/libs/core/src/test/java/org/opensearch/core/common/StringsTests.java @@ -114,4 +114,15 @@ public void testToStringToXContentWithOrWithoutParams() { containsString("\"color_from_param\":\"blue\"") ); } + + public void testIsDigits() { + assertTrue(Strings.isDigits("1")); + assertTrue(Strings.isDigits("123")); + assertFalse(Strings.isDigits("")); + assertFalse(Strings.isDigits("abc")); + assertFalse(Strings.isDigits("123a")); + assertFalse(Strings.isDigits("0x123")); + assertFalse(Strings.isDigits("123.4")); + assertFalse(Strings.isDigits("123f")); + } } diff --git a/plugins/analysis-phonenumber/build.gradle b/plugins/analysis-phonenumber/build.gradle new file mode 100644 index 0000000000000..c9913b36f8508 --- /dev/null +++ b/plugins/analysis-phonenumber/build.gradle @@ -0,0 +1,21 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + * + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ + +apply plugin: 'opensearch.yaml-rest-test' + +opensearchplugin { + description 'Adds an analyzer for phone numbers to OpenSearch.' + classname 'org.opensearch.analysis.phone.PhoneNumberAnalysisPlugin' +} + +dependencies { + implementation group: 'com.googlecode.libphonenumber', name: 'libphonenumber', version: '8.13.45' +} diff --git a/plugins/analysis-phonenumber/licenses/libphonenumber-8.13.45.jar.sha1 b/plugins/analysis-phonenumber/licenses/libphonenumber-8.13.45.jar.sha1 new file mode 100644 index 0000000000000..00d393482ee49 --- /dev/null +++ b/plugins/analysis-phonenumber/licenses/libphonenumber-8.13.45.jar.sha1 @@ -0,0 +1 @@ +bfac00f71616796abc7d8b135dda12558a0ccee2 \ No newline at end of file diff --git a/plugins/analysis-phonenumber/licenses/libphonenumber-LICENSE.txt b/plugins/analysis-phonenumber/licenses/libphonenumber-LICENSE.txt new file mode 100644 index 0000000000000..d9a10c0d8e868 --- /dev/null +++ b/plugins/analysis-phonenumber/licenses/libphonenumber-LICENSE.txt @@ -0,0 +1,176 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS diff --git a/plugins/analysis-phonenumber/licenses/libphonenumber-NOTICE.txt b/plugins/analysis-phonenumber/licenses/libphonenumber-NOTICE.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberAnalysisPlugin.java b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberAnalysisPlugin.java new file mode 100644 index 0000000000000..eb12b43f70154 --- /dev/null +++ b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberAnalysisPlugin.java @@ -0,0 +1,60 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.phone; + +import org.apache.lucene.analysis.Analyzer; +import org.opensearch.index.analysis.AnalyzerProvider; +import org.opensearch.index.analysis.TokenizerFactory; +import org.opensearch.indices.analysis.AnalysisModule; +import org.opensearch.plugins.AnalysisPlugin; +import org.opensearch.plugins.Plugin; + +import java.util.Map; +import java.util.TreeMap; + +/** + * This plugin provides an analyzer and tokenizer for fields which contain phone numbers, supporting a variety of formats + * (with/without international calling code, different country formats, etc.). + */ +public class PhoneNumberAnalysisPlugin extends Plugin implements AnalysisPlugin { + + @Override + public Map>> getAnalyzers() { + Map>> analyzers = new TreeMap<>(); + analyzers.put( + "phone", + (indexSettings, environment, name, settings) -> new PhoneNumberAnalyzerProvider(indexSettings, "phone", settings, true) + ); + analyzers.put( + "phone-search", + (indexSettings, environment, name, settings) -> new PhoneNumberAnalyzerProvider(indexSettings, "phone-search", settings, false) + ); + return analyzers; + } + + @Override + public Map> getTokenizers() { + Map> tokenizers = new TreeMap<>(); + tokenizers.put( + "phone", + (indexSettings, environment, name, settings) -> new PhoneNumberTermTokenizerFactory(indexSettings, "phone", settings, true) + ); + tokenizers.put( + "phone-search", + (indexSettings, environment, name, settings) -> new PhoneNumberTermTokenizerFactory( + indexSettings, + "phone-search", + settings, + false + ) + ); + return tokenizers; + } + +} diff --git a/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberAnalyzer.java b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberAnalyzer.java new file mode 100644 index 0000000000000..cd945e186b2ba --- /dev/null +++ b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberAnalyzer.java @@ -0,0 +1,51 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.phone; + +import org.apache.lucene.analysis.Analyzer; +import org.opensearch.common.settings.Settings; + +/** + * Analyzer for phone numbers, using {@link PhoneNumberTermTokenizer}. + * + *

+ * You can use the {@code phone} and {@code phone-search} analyzers on your fields to index phone numbers. + * Use {@code phone} (which creates ngrams) for the {@code analyzer} and {@code phone-search} (which doesn't create ngrams) + * for the {@code search_analyzer}. + *

+ * + *

+ * You optionally can specify a region with the {@code phone-region} setting for the phone number which will ensure that + * phone numbers without the international dialling prefix (using {@code +}) are also tokenized correctly. + *

+ * + *

+ * Note that the tokens will not refer to a specific position in the stream as the tokenizer is expected to be used on strings + * containing phone numbers and not arbitrary text with interspersed phone numbers. + *

+ */ +public class PhoneNumberAnalyzer extends Analyzer { + private final boolean addNgrams; + private final Settings settings; + + /** + * @param addNgrams defines whether ngrams for the phone number should be added. Set to true for indexing and false for search. + * @param settings the settings for the analyzer. + */ + public PhoneNumberAnalyzer(final Settings settings, final boolean addNgrams) { + this.addNgrams = addNgrams; + this.settings = settings; + } + + @Override + protected TokenStreamComponents createComponents(String fieldName) { + final var tokenizer = new PhoneNumberTermTokenizer(this.settings, this.addNgrams); + return new Analyzer.TokenStreamComponents(tokenizer, tokenizer); + } +} diff --git a/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerProvider.java b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerProvider.java new file mode 100644 index 0000000000000..272a019ba0f9c --- /dev/null +++ b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerProvider.java @@ -0,0 +1,42 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.phone; + +import org.opensearch.common.settings.Settings; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AbstractIndexAnalyzerProvider; + +/** + * Provider for {@link PhoneNumberAnalyzer}. + */ +public class PhoneNumberAnalyzerProvider extends AbstractIndexAnalyzerProvider { + + private final PhoneNumberAnalyzer analyzer; + + /** + * @param indexSettings the settings of the index. + * @param name the analyzer name. + * @param settings the settings for the analyzer. + * @param addNgrams defines whether ngrams for the phone number should be added. Set to true for indexing and false for search. + */ + public PhoneNumberAnalyzerProvider( + final IndexSettings indexSettings, + final String name, + final Settings settings, + final boolean addNgrams + ) { + super(indexSettings, name, settings); + this.analyzer = new PhoneNumberAnalyzer(settings, addNgrams); + } + + @Override + public PhoneNumberAnalyzer get() { + return this.analyzer; + } +} diff --git a/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java new file mode 100644 index 0000000000000..6b95594204eb4 --- /dev/null +++ b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java @@ -0,0 +1,157 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.phone; + +import com.google.i18n.phonenumbers.NumberParseException; +import com.google.i18n.phonenumbers.PhoneNumberUtil; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.opensearch.common.io.Streams; +import org.opensearch.common.settings.Settings; +import org.opensearch.core.common.Strings; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Optional; +import java.util.Set; + +/** + * This tokenizes a phone number into its individual parts, using {@link PhoneNumberUtil}. + * + *

+ * You can use the {@code phone} and {@code phone-search} analyzers on your fields to index phone numbers. + * Use {@code phone} (which creates ngrams) for the {@code analyzer} and {@code phone-search} (which doesn't create ngrams) + * for the {@code search_analyzer}. + *

+ * + *

+ * You optionally can specify a region with the {@code phone-region} setting for the phone number which will ensure that + * phone numbers without the international dialling prefix (using {@code +}) are also tokenized correctly. + *

+ * + *

+ * Note that the tokens will not refer to a specific position in the stream as the tokenizer is expected to be used on strings + * containing phone numbers and not arbitrary text with interspersed phone numbers. + *

+ */ +public final class PhoneNumberTermTokenizer extends Tokenizer { + private final boolean addNgrams; + private final Settings settings; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private Iterator tokenIterator; + + /** + * @param addNgrams defines whether ngrams for the phone number should be added. Set to true for indexing and false for search. + * @param settings the settings for the analyzer. + */ + public PhoneNumberTermTokenizer(final Settings settings, final boolean addNgrams) { + super(); + this.addNgrams = addNgrams; + this.settings = settings; + } + + /** {@inheritDoc} */ + @Override + public void reset() throws IOException { + super.reset(); + tokenIterator = null; + } + + /** {@inheritDoc} */ + @Override + public boolean incrementToken() throws IOException { + clearAttributes(); + if (tokenIterator == null) { + tokenIterator = getTokens().iterator(); + } + if (tokenIterator.hasNext()) { + termAtt.append(tokenIterator.next()); + return true; + } + return false; + } + + /** + * Search for a phone number in the input and tokenize it. + * + *

+ * The tokens include the full phone number with and without country prefix (if it could be identified) and - if + * enabled by {@link #addNgrams} - an ngram of the phone number. + *

+ * + * @return all tokens (unique, unordered). + * @throws IOException in case the input cannot be read. + */ + private Set getTokens() throws IOException { + final var tokens = new HashSet(); + + var input = Streams.copyToString(this.input); + + tokens.add(input); + + // Rip off the "tel:" or "sip:" prefix + if (input.indexOf("tel:") == 0 || input.indexOf("sip:") == 0) { + tokens.add(input.substring(0, 4)); + input = input.substring(4); + } + + final var startIndex = input.startsWith("+") ? 1 : 0; + // Add the complete input but skip a leading + + tokens.add(input.substring(startIndex)); + + // Drop anything after @. Most likely there's nothing of interest + final var posAt = input.indexOf('@'); + if (posAt != -1) { + input = input.substring(0, posAt); + + // Add a token for the raw unmanipulated address. Note this could be a username (sip) instead of telephone + // number so take it as is + tokens.add(input.substring(startIndex)); + } + + // Let google's libphone try to parse it + final var phoneUtil = PhoneNumberUtil.getInstance(); + Optional countryCode = Optional.empty(); + try { + // ZZ is the generic "I don't know the country code" region. Google's libphone library will try to infer it. + final var region = this.settings.get("phone-region", "ZZ"); + final var numberProto = phoneUtil.parse(input, region); + if (numberProto != null) { + // Libphone likes it! + countryCode = Optional.of(String.valueOf(numberProto.getCountryCode())); + input = String.valueOf(numberProto.getNationalNumber()); + + // Add Country code, extension, and the number as tokens + tokens.add(countryCode.get()); + tokens.add(countryCode.get() + input); + if (!Strings.isEmpty(numberProto.getExtension())) { + tokens.add(numberProto.getExtension()); + } + + tokens.add(input); + } + } catch (final NumberParseException | StringIndexOutOfBoundsException e) { + // Libphone didn't like it, no biggie. We'll just ngram the number as it is. + } + + // ngram the phone number, e.g. 19198243333 produces 9, 91, 919, etc + if (this.addNgrams && Strings.isDigits(input)) { + for (int count = 1; count <= input.length(); ++count) { + final var token = input.substring(0, count); + tokens.add(token); + // If there was a country code, add more ngrams such that 19198243333 produces 19, 191, 1919, etc + countryCode.ifPresent(s -> tokens.add(s + token)); + } + } + + return tokens; + } + +} diff --git a/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizerFactory.java b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizerFactory.java new file mode 100644 index 0000000000000..fde44e15c9667 --- /dev/null +++ b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizerFactory.java @@ -0,0 +1,44 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.phone; + +import org.apache.lucene.analysis.Tokenizer; +import org.opensearch.common.settings.Settings; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AbstractTokenizerFactory; + +/** + * Factory for {@link PhoneNumberTermTokenizer}. + */ +public class PhoneNumberTermTokenizerFactory extends AbstractTokenizerFactory { + private final Settings settings; + private final boolean addNgrams; + + /** + * @param indexSettings the settings of the index. + * @param name the tokenizer name. + * @param settings the settings for the analyzer. + * @param addNgrams defines whether ngrams for the phone number should be added. Set to true for indexing and false for search. + */ + public PhoneNumberTermTokenizerFactory( + final IndexSettings indexSettings, + final String name, + final Settings settings, + final boolean addNgrams + ) { + super(indexSettings, settings, name); + this.settings = settings; + this.addNgrams = addNgrams; + } + + @Override + public Tokenizer create() { + return new PhoneNumberTermTokenizer(this.settings, this.addNgrams); + } +} diff --git a/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java b/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java new file mode 100644 index 0000000000000..332f6d21f47d6 --- /dev/null +++ b/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java @@ -0,0 +1,253 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.phone; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.opensearch.index.analysis.AnalysisTestsHelper; +import org.opensearch.test.OpenSearchTokenStreamTestCase; +import org.junit.BeforeClass; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.hamcrest.Matchers.arrayContainingInAnyOrder; +import static org.hamcrest.Matchers.hasItemInArray; + +public class PhoneNumberAnalyzerTests extends OpenSearchTokenStreamTestCase { + private static final String RESOURCE = "/org/opensearch/analysis/phone/phone_analysis.json"; + + private static Analyzer phoneAnalyzer; + private static Analyzer phoneSearchAnalyzer; + private static Analyzer phoneCHAnalyzer; + private static Analyzer phoneSearchCHAnalyzer; + + @BeforeClass + public static void beforeClass() throws IOException { + final var analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath( + createTempDir(), + RESOURCE, + new PhoneNumberAnalysisPlugin() + ); + phoneAnalyzer = analysis.indexAnalyzers.get("phone"); + assertNotNull(phoneAnalyzer); + phoneSearchAnalyzer = analysis.indexAnalyzers.get("phone-search"); + assertNotNull(phoneSearchAnalyzer); + phoneCHAnalyzer = analysis.indexAnalyzers.get("phone-ch"); + assertNotNull(phoneCHAnalyzer); + phoneSearchCHAnalyzer = analysis.indexAnalyzers.get("phone-search-ch"); + assertNotNull(phoneSearchCHAnalyzer); + } + + /** + * Test for all tokens which are emitted by the "phone" analyzer. + */ + public void testEuropeDetailled() throws IOException { + assertTokensAreInAnyOrder( + phoneAnalyzer, + "tel:+441344840400", + Arrays.asList( + "tel:+441344840400", + "tel:", + "441344840400", + "44", + "1344840400", + "1", + "441", + "13", + "4413", + "134", + "44134", + "1344", + "441344", + "13448", + "4413448", + "134484", + "44134484", + "1344840", + "441344840", + "13448404", + "4413448404", + "134484040", + "44134484040" + ) + ); + } + + /** + * Test for all tokens which are emitted by the "phone" analyzer. + */ + public void testEuropeDetailledSearch() throws IOException { + assertTokensAreInAnyOrder( + phoneSearchAnalyzer, + "tel:+441344840400", + Arrays.asList("tel:+441344840400", "tel:", "441344840400", "44", "1344840400") + ); + } + + public void testEurope() throws IOException { + assertTokensInclude("tel:+441344840400", Arrays.asList("44", "1344", "1344840400", "441344840400")); + } + + public void testGermanCastle() throws IOException { + assertTokensInclude("tel:+498362930830", Arrays.asList("49", "498362930830", "8362930830")); + } + + public void testBMWofSydney() throws IOException { + assertTokensInclude("tel:+61293344555", Arrays.asList("61", "293344555", "61293344555")); + } + + public void testCoffeeShopInIreland() throws IOException { + assertTokensInclude("tel:+442890319416", Arrays.asList("44", "289", "2890319416", "442890319416")); + } + + public void testTelWithCountryCode() throws IOException { + assertTokensInclude("tel:+17177158163", Arrays.asList("1", "717", "7177", "17177158163")); + } + + public void testTelWithCountryCode2() throws IOException { + assertTokensInclude("tel:+12177148350", Arrays.asList("1", "217", "2177", "2177148350", "12177148350")); + } + + public void testNewTollFreeNumber() throws IOException { + assertTokensInclude("tel:+18337148350", Arrays.asList("1", "833", "8337", "8337148350", "18337148350")); + } + + public void testMissingCountryCode() throws IOException { + assertTokensInclude("tel:8177148350", Arrays.asList("817", "8177", "81771", "817714", "8177148350")); + } + + public void testSipWithNumericUsername() throws IOException { + assertTokensInclude("sip:222@autosbcpc", Arrays.asList("222")); + } + + public void testTruncatedNumber() throws IOException { + assertTokensInclude("tel:5551234", Arrays.asList("5551234")); + } + + public void testSipWithAlphabeticUsername() throws IOException { + assertTokensInclude("sip:abc@autosbcpc", Arrays.asList("abc")); + } + + public void testGarbageInGarbageOut() throws IOException { + assertTokensInclude("test", Arrays.asList("test")); + } + + public void testSipWithCountryCode() throws IOException { + assertTokensInclude("sip:+14177141363@178.97.105.13;isup-oli=0;pstn-params=808481808882", Arrays.asList("417", "4177", "14177")); + } + + public void testSipWithTelephoneExtension() throws IOException { + assertTokensInclude("sip:+13169410766;ext=2233@178.17.10.117:8060", Arrays.asList("316", "2233", "1316")); + } + + public void testSipWithUsername() throws IOException { + assertTokensInclude("sip:JeffSIP@178.12.220.18", Arrays.asList("JeffSIP")); + } + + public void testPhoneNumberWithoutPrefix() throws IOException { + assertTokensInclude("+14177141363", Arrays.asList("14177141363", "417", "4177", "14177")); + } + + public void testSipWithoutDomainPart() throws IOException { + assertTokensInclude("sip:+122882", Arrays.asList("122882", "122", "228", "1228", "2288", "12288")); + } + + public void testTelPrefix() throws IOException { + assertTokensInclude("tel:+1228", Arrays.asList("1228", "122", "228")); + } + + public void testNumberPrefix() throws IOException { + assertTokensInclude("+1228", Arrays.asList("1228", "122", "228")); + } + + public void testInternationalPrefixWithZZ() throws IOException { + assertTokensInclude(phoneAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testInternationalPrefixWithCH() throws IOException { + assertTokensInclude(phoneCHAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testNationalPrefixWithCH() throws IOException { + // + is equivalent to 00 in Switzerland + assertTokensInclude(phoneCHAnalyzer, "0041583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testLocalNumberWithCH() throws IOException { + // when omitting the international prefix swiss numbers must start with '0' + assertTokensInclude(phoneCHAnalyzer, "0583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testSearchInternationalPrefixWithZZ() throws IOException { + assertTokensInclude(phoneSearchAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testSearchInternationalPrefixWithCH() throws IOException { + assertTokensInclude(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testSearchNationalPrefixWithCH() throws IOException { + // + is equivalent to 00 in Switzerland + assertTokensInclude(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + public void testSearchLocalNumberWithCH() throws IOException { + // when omitting the international prefix swiss numbers must start with '0' + assertTokensInclude(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("41", "41583161010", "583161010")); + } + + /** + * Unlike {@link #assertTokenStreamContents(TokenStream, String[])} this only asserts whether the generated tokens + * contain the required ones but does not check for order. Use {@link #assertTokensInclude} if completeness is not needed. + */ + private void assertTokensAreInAnyOrder(final Analyzer analyzer, final String input, final List expectedTokens) + throws IOException { + final var ts = analyzer.tokenStream("test", input); + final var allTokens = getAllTokens(ts).toArray(); + assertThat(allTokens, arrayContainingInAnyOrder(expectedTokens.toArray())); + } + + /** + * Unlike {@link #assertTokenStreamContents(TokenStream, String[])} this only asserts whether the generated tokens + * contain the required ones but does not check for completeness or order. + */ + private void assertTokensInclude(final Analyzer analyzer, final String input, final List expectedTokens) throws IOException { + final var ts = analyzer.tokenStream("test", input); + final var allTokens = getAllTokens(ts).toArray(); + for (final var expectedToken : expectedTokens) { + assertThat(allTokens, hasItemInArray(expectedToken)); + } + } + + /** + * Unlike {@link #assertTokenStreamContents(TokenStream, String[])} this only asserts whether the generated tokens + * contain the required ones but does not check for completeness or order. + * This uses {@link #phoneAnalyzer}. + */ + private void assertTokensInclude(final String input, final List expectedTokens) throws IOException { + this.assertTokensInclude(phoneAnalyzer, input, expectedTokens); + } + + private List getAllTokens(final TokenStream ts) throws IOException { + final var tokens = new ArrayList(); + final var termAtt = ts.getAttribute(CharTermAttribute.class); + ts.reset(); + while (ts.incrementToken()) { + tokens.add(termAtt.toString()); + } + ts.end(); + ts.close(); + return tokens; + } + +} diff --git a/plugins/analysis-phonenumber/src/test/resources/org/opensearch/analysis/phone/phone_analysis.json b/plugins/analysis-phonenumber/src/test/resources/org/opensearch/analysis/phone/phone_analysis.json new file mode 100644 index 0000000000000..7e45177c57492 --- /dev/null +++ b/plugins/analysis-phonenumber/src/test/resources/org/opensearch/analysis/phone/phone_analysis.json @@ -0,0 +1,22 @@ +{ + "index": { + "analysis": { + "analyzer": { + "phone": { + "type": "phone" + }, + "phone-search": { + "type": "phone-search" + }, + "phone-ch": { + "type": "phone", + "phone-region": "CH" + }, + "phone-search-ch": { + "type": "phone-search", + "phone-region": "CH" + } + } + } + } +} diff --git a/plugins/analysis-phonenumber/src/yamlRestTest/java/org/opensearch/analysis/phone/PhoneNumberAnalysisClientYamlTestSuiteIT.java b/plugins/analysis-phonenumber/src/yamlRestTest/java/org/opensearch/analysis/phone/PhoneNumberAnalysisClientYamlTestSuiteIT.java new file mode 100644 index 0000000000000..d514a3329a1a7 --- /dev/null +++ b/plugins/analysis-phonenumber/src/yamlRestTest/java/org/opensearch/analysis/phone/PhoneNumberAnalysisClientYamlTestSuiteIT.java @@ -0,0 +1,49 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ + +package org.opensearch.analysis.phone; + +import com.carrotsearch.randomizedtesting.annotations.Name; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + +import org.opensearch.test.rest.yaml.ClientYamlTestCandidate; +import org.opensearch.test.rest.yaml.OpenSearchClientYamlSuiteTestCase; + +public class PhoneNumberAnalysisClientYamlTestSuiteIT extends OpenSearchClientYamlSuiteTestCase { + public PhoneNumberAnalysisClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) { + super(testCandidate); + } + + @ParametersFactory + public static Iterable parameters() throws Exception { + return OpenSearchClientYamlSuiteTestCase.createParameters(); + } +} diff --git a/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/10_basic.yml b/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/10_basic.yml new file mode 100644 index 0000000000000..5bea0cf4650d6 --- /dev/null +++ b/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/10_basic.yml @@ -0,0 +1,8 @@ +"Test that the plugin is loaded in OpenSearch": + - do: + cat.plugins: + local: true + h: component + + - match: + $body: /^analysis-phonenumber\n$/ diff --git a/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml b/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml new file mode 100644 index 0000000000000..0bd7d2c371bfc --- /dev/null +++ b/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml @@ -0,0 +1,56 @@ +# Integration tests for phone analysis components +# +--- +"Index phone number content": + - do: + indices.create: + index: test + body: + settings: + index: + analysis: + analyzer: + phone-ch: + type: "phone" + "phone-region": "CH" + phone-search-ch: + type: "phone-search" + "phone-region": "CH" + mappings: + properties: + phone: + type: text + analyzer: "phone" + search_analyzer: "phone-search" + phone-ch: + type: text + analyzer: "phone-ch" + search_analyzer: "phone-search-ch" + + - do: + index: + index: test + id: 1 + body: { "phone": "+41 58 316 10 10", "phone-ch": "058 316 10 10" } + - do: + indices.refresh: {} + + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "+41583161010" + - match: { hits.total: 1 } + + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone-ch": "+41583161010" + - match: { hits.total: 1 }