Introduce Serbian analyzer

opensearch-project · Jun 27, 2024 · 4b41c2a · 4b41c2a
1 parent 0eb39ae
commit 4b41c2a
Show file tree

Hide file tree

Showing 5 changed files with 100 additions and 1 deletion.
diff --git a/...lysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/...lysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java
@@ -117,6 +117,7 @@
 import org.apache.lucene.analysis.ru.RussianAnalyzer;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.sr.SerbianAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.sv.SwedishAnalyzer;
 import org.apache.lucene.analysis.th.ThaiAnalyzer;
@@ -237,6 +238,7 @@ public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAn
         analyzers.put("portuguese", PortugueseAnalyzerProvider::new);
         analyzers.put("romanian", RomanianAnalyzerProvider::new);
         analyzers.put("russian", RussianAnalyzerProvider::new);
+        analyzers.put("serbian", SerbianAnalyzerProvider::new);
         analyzers.put("sorani", SoraniAnalyzerProvider::new);
         analyzers.put("spanish", SpanishAnalyzerProvider::new);
         analyzers.put("swedish", SwedishAnalyzerProvider::new);
@@ -469,6 +471,7 @@ public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactorie
         analyzers.add(new PreBuiltAnalyzerProviderFactory("portuguese", CachingStrategy.LUCENE, PortugueseAnalyzer::new));
         analyzers.add(new PreBuiltAnalyzerProviderFactory("romanian", CachingStrategy.LUCENE, RomanianAnalyzer::new));
         analyzers.add(new PreBuiltAnalyzerProviderFactory("russian", CachingStrategy.LUCENE, RussianAnalyzer::new));
+        analyzers.add(new PreBuiltAnalyzerProviderFactory("serbian", CachingStrategy.LUCENE, SerbianAnalyzer::new));
         analyzers.add(new PreBuiltAnalyzerProviderFactory("sorani", CachingStrategy.LUCENE, SoraniAnalyzer::new));
         analyzers.add(new PreBuiltAnalyzerProviderFactory("spanish", CachingStrategy.LUCENE, SpanishAnalyzer::new));
         analyzers.add(new PreBuiltAnalyzerProviderFactory("swedish", CachingStrategy.LUCENE, SwedishAnalyzer::new));

diff --git a/...analysis-common/src/main/java/org/opensearch/analysis/common/SerbianAnalyzerProvider.java b/...analysis-common/src/main/java/org/opensearch/analysis/common/SerbianAnalyzerProvider.java
@@ -0,0 +1,61 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Modifications Copyright OpenSearch Contributors. See
+ * GitHub history for details.
+ */
+
+package org.opensearch.analysis.common;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.sr.SerbianAnalyzer;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.env.Environment;
+import org.opensearch.index.IndexSettings;
+import org.opensearch.index.analysis.AbstractIndexAnalyzerProvider;
+import org.opensearch.index.analysis.Analysis;
+
+/** Index-level provider that builds one {@link SerbianAnalyzer} from the analyzer's settings and hands it out via {@link #get()}. */
+public class SerbianAnalyzerProvider extends AbstractIndexAnalyzerProvider<SerbianAnalyzer> {
+
+    private final SerbianAnalyzer serbianAnalyzer;
+
+    SerbianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        // Both sets are parsed from settings; Lucene's Serbian stop set and an empty set are passed in as the defaults.
+        final CharArraySet stopWords = Analysis.parseStopWords(env, settings, SerbianAnalyzer.getDefaultStopSet());
+        final CharArraySet stemExclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);
+        serbianAnalyzer = new SerbianAnalyzer(stopWords, stemExclusions);
+    }
+
+    /** Returns the analyzer instance created in the constructor. */
+    @Override
+    public SerbianAnalyzer get() {
+        return serbianAnalyzer;
+    }
+}
diff --git a/...alysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java b/...alysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java
@@ -96,6 +96,7 @@
 import org.tartarus.snowball.ext.PortugueseStemmer;
 import org.tartarus.snowball.ext.RomanianStemmer;
 import org.tartarus.snowball.ext.RussianStemmer;
+import org.tartarus.snowball.ext.SerbianStemmer;
 import org.tartarus.snowball.ext.SpanishStemmer;
 import org.tartarus.snowball.ext.SwedishStemmer;
 import org.tartarus.snowball.ext.TurkishStemmer;
@@ -257,7 +258,10 @@ public TokenStream create(TokenStream tokenStream) {
             } else if ("russian".equalsIgnoreCase(language)) {
                 return new SnowballFilter(tokenStream, new RussianStemmer());
             } else if ("light_russian".equalsIgnoreCase(language) || "lightRussian".equalsIgnoreCase(language)) {
-                return new RussianLightStemFilter(tokenStream);
+            return new RussianLightStemFilter(tokenStream);
+
+            } else if ("serbian".equalsIgnoreCase(language)) {
+                return new SnowballFilter(tokenStream, new SerbianStemmer());
 
                 // Spanish stemmers
             } else if ("spanish".equalsIgnoreCase(language)) {

diff --git a/...sis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml b/...sis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/20_analyzers.yml
@@ -988,6 +988,35 @@
     - length: { tokens: 1 }
     - match:  { tokens.0.token: вмест }
 
+---
+"serbian":
+  - do:
+      indices.create:
+        index: test
+        body:
+          settings:
+            analysis:
+              analyzer:
+                my_analyzer:
+                  type: serbian
+  # First analyze call names the analyzer "serbian" directly and passes no index.
+  - do:
+      indices.analyze:
+        body:
+          text: можемо заједно
+          analyzer: serbian
+  - length: { tokens: 1 }
+  - match: { tokens.0.token: zajedn }  # Cyrillic input, Latin-script token expected; presumably the first word is dropped as a stop word (confirm)
+  # Second analyze call runs the same text through my_analyzer (type: serbian) on index "test"; same result expected.
+  - do:
+      indices.analyze:
+        index: test
+        body:
+          text: можемо заједно
+          analyzer: my_analyzer
+  - length: { tokens: 1 }
+  - match: { tokens.0.token: zajedn }
+
 ---
 "sorani":
     - do:

diff --git a/server/src/main/java/org/opensearch/index/analysis/Analysis.java b/server/src/main/java/org/opensearch/index/analysis/Analysis.java
@@ -66,6 +66,7 @@
 import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
 import org.apache.lucene.analysis.ro.RomanianAnalyzer;
 import org.apache.lucene.analysis.ru.RussianAnalyzer;
+import org.apache.lucene.analysis.sr.SerbianAnalyzer;
 import org.apache.lucene.analysis.sv.SwedishAnalyzer;
 import org.apache.lucene.analysis.th.ThaiAnalyzer;
 import org.apache.lucene.analysis.tr.TurkishAnalyzer;
@@ -148,6 +149,7 @@ public static CharArraySet parseStemExclusion(Settings settings, CharArraySet de
         namedStopWords.put("_portuguese_", PortugueseAnalyzer.getDefaultStopSet());
         namedStopWords.put("_romanian_", RomanianAnalyzer.getDefaultStopSet());
         namedStopWords.put("_russian_", RussianAnalyzer.getDefaultStopSet());
+        namedStopWords.put("_serbian_", SerbianAnalyzer.getDefaultStopSet());
         namedStopWords.put("_sorani_", SoraniAnalyzer.getDefaultStopSet());
         namedStopWords.put("_spanish_", SpanishAnalyzer.getDefaultStopSet());
         namedStopWords.put("_swedish_", SwedishAnalyzer.getDefaultStopSet());