diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index cf2736a8583d2..c0db55a884679 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -117,6 +117,7 @@ import org.apache.lucene.analysis.ru.RussianAnalyzer; import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.snowball.SnowballFilter; +import org.apache.lucene.analysis.sr.SerbianAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.sv.SwedishAnalyzer; import org.apache.lucene.analysis.th.ThaiAnalyzer; @@ -237,6 +238,7 @@ public Map>> getAn analyzers.put("portuguese", PortugueseAnalyzerProvider::new); analyzers.put("romanian", RomanianAnalyzerProvider::new); analyzers.put("russian", RussianAnalyzerProvider::new); + analyzers.put("serbian", SerbianAnalyzerProvider::new); analyzers.put("sorani", SoraniAnalyzerProvider::new); analyzers.put("spanish", SpanishAnalyzerProvider::new); analyzers.put("swedish", SwedishAnalyzerProvider::new); @@ -469,6 +471,7 @@ public List getPreBuiltAnalyzerProviderFactorie analyzers.add(new PreBuiltAnalyzerProviderFactory("portuguese", CachingStrategy.LUCENE, PortugueseAnalyzer::new)); analyzers.add(new PreBuiltAnalyzerProviderFactory("romanian", CachingStrategy.LUCENE, RomanianAnalyzer::new)); analyzers.add(new PreBuiltAnalyzerProviderFactory("russian", CachingStrategy.LUCENE, RussianAnalyzer::new)); + analyzers.add(new PreBuiltAnalyzerProviderFactory("serbian", CachingStrategy.LUCENE, SerbianAnalyzer::new)); analyzers.add(new PreBuiltAnalyzerProviderFactory("sorani", CachingStrategy.LUCENE, SoraniAnalyzer::new)); analyzers.add(new PreBuiltAnalyzerProviderFactory("spanish", CachingStrategy.LUCENE, SpanishAnalyzer::new)); analyzers.add(new PreBuiltAnalyzerProviderFactory("swedish", CachingStrategy.LUCENE, SwedishAnalyzer::new)); diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SerbianAnalyzerProvider.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SerbianAnalyzerProvider.java new file mode 100644 index 0000000000000..99077d433ccef --- /dev/null +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SerbianAnalyzerProvider.java @@ -0,0 +1,61 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.sr.SerbianAnalyzer; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AbstractIndexAnalyzerProvider; +import org.opensearch.index.analysis.Analysis; + +public class SerbianAnalyzerProvider extends AbstractIndexAnalyzerProvider { + + private final SerbianAnalyzer analyzer; + + SerbianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { + + super(indexSettings, name, settings); + analyzer = new SerbianAnalyzer( + Analysis.parseStopWords(env, settings, SerbianAnalyzer.getDefaultStopSet()), + Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET) + ); + } + + @Override + public SerbianAnalyzer get() { + + return this.analyzer; + } +} diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java index 5506626e40da0..547a1f5cc008f 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java @@ -96,6 +96,7 @@ import org.tartarus.snowball.ext.PortugueseStemmer; import org.tartarus.snowball.ext.RomanianStemmer; import org.tartarus.snowball.ext.RussianStemmer; +import org.tartarus.snowball.ext.SerbianStemmer; import org.tartarus.snowball.ext.SpanishStemmer; import org.tartarus.snowball.ext.SwedishStemmer; import org.tartarus.snowball.ext.TurkishStemmer; @@ -259,6 +260,9 @@ public TokenStream create(TokenStream tokenStream) { } else if ("light_russian".equalsIgnoreCase(language) || "lightRussian".equalsIgnoreCase(language)) { return new RussianLightStemFilter(tokenStream); + } else if ("serbian".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new SerbianStemmer()); + // Spanish stemmers } else if ("spanish".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new SpanishStemmer()); diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml index dcec02729a44e..d0e71fcd7a796 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml @@ -988,6 +988,35 @@ - length: { tokens: 1 } - match: { tokens.0.token: вмест } +--- +"serbian": + - do: + indices.create: + index: test + body: + settings: + analysis: + analyzer: + my_analyzer: + type: serbian + + - do: + indices.analyze: + body: + text: можемо заједно + analyzer: serbian + - length: { tokens: 1 } + - match: { tokens.0.token: zajedn } + + - do: + indices.analyze: + index: test + body: + text: можемо заједно + analyzer: my_analyzer + - length: { tokens: 1 } + - match: { tokens.0.token: zajedn } + --- "sorani": - do: diff --git a/server/src/main/java/org/opensearch/index/analysis/Analysis.java b/server/src/main/java/org/opensearch/index/analysis/Analysis.java index b9a219057f326..c380dfaa2e021 100644 --- a/server/src/main/java/org/opensearch/index/analysis/Analysis.java +++ b/server/src/main/java/org/opensearch/index/analysis/Analysis.java @@ -66,6 +66,7 @@ import org.apache.lucene.analysis.pt.PortugueseAnalyzer; import org.apache.lucene.analysis.ro.RomanianAnalyzer; import org.apache.lucene.analysis.ru.RussianAnalyzer; +import org.apache.lucene.analysis.sr.SerbianAnalyzer; import org.apache.lucene.analysis.sv.SwedishAnalyzer; import org.apache.lucene.analysis.th.ThaiAnalyzer; import org.apache.lucene.analysis.tr.TurkishAnalyzer; @@ -148,6 +149,7 @@ public static CharArraySet parseStemExclusion(Settings settings, CharArraySet de namedStopWords.put("_portuguese_", PortugueseAnalyzer.getDefaultStopSet()); namedStopWords.put("_romanian_", RomanianAnalyzer.getDefaultStopSet()); namedStopWords.put("_russian_", RussianAnalyzer.getDefaultStopSet()); + namedStopWords.put("_serbian_", SerbianAnalyzer.getDefaultStopSet()); namedStopWords.put("_sorani_", SoraniAnalyzer.getDefaultStopSet()); namedStopWords.put("_spanish_", SpanishAnalyzer.getDefaultStopSet()); namedStopWords.put("_swedish_", SwedishAnalyzer.getDefaultStopSet());