Skip to content

Commit

Permalink
Introduce Serbian analyzer
Browse files Browse the repository at this point in the history
  • Loading branch information
velizarvel committed Jun 27, 2024
1 parent 0eb39ae commit 4b41c2a
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.sr.SerbianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
Expand Down Expand Up @@ -237,6 +238,7 @@ public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAn
analyzers.put("portuguese", PortugueseAnalyzerProvider::new);
analyzers.put("romanian", RomanianAnalyzerProvider::new);
analyzers.put("russian", RussianAnalyzerProvider::new);
analyzers.put("serbian", SerbianAnalyzerProvider::new);
analyzers.put("sorani", SoraniAnalyzerProvider::new);
analyzers.put("spanish", SpanishAnalyzerProvider::new);
analyzers.put("swedish", SwedishAnalyzerProvider::new);
Expand Down Expand Up @@ -469,6 +471,7 @@ public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactorie
analyzers.add(new PreBuiltAnalyzerProviderFactory("portuguese", CachingStrategy.LUCENE, PortugueseAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("romanian", CachingStrategy.LUCENE, RomanianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("russian", CachingStrategy.LUCENE, RussianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("serbian", CachingStrategy.LUCENE, SerbianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("sorani", CachingStrategy.LUCENE, SoraniAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("spanish", CachingStrategy.LUCENE, SpanishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("swedish", CachingStrategy.LUCENE, SwedishAnalyzer::new));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.sr.SerbianAnalyzer;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.opensearch.index.analysis.Analysis;

/**
 * Index-level provider that exposes Lucene's {@link SerbianAnalyzer}.
 *
 * <p>The analyzer is created once in the constructor and the same instance is
 * handed back by every call to {@link #get()}.
 */
public class SerbianAnalyzerProvider extends AbstractIndexAnalyzerProvider<SerbianAnalyzer> {

    private final SerbianAnalyzer analyzer;

    /**
     * Creates the provider and eagerly builds its analyzer.
     *
     * @param indexSettings settings of the owning index, forwarded to the superclass
     * @param env           environment handed to {@link Analysis#parseStopWords}
     * @param name          analyzer name, forwarded to the superclass
     * @param settings      analyzer settings from which stop words and stem exclusions are parsed
     */
    SerbianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);

        // Stop words are parsed first, with Lucene's Serbian default set passed as the default argument.
        final CharArraySet stopWords = Analysis.parseStopWords(env, settings, SerbianAnalyzer.getDefaultStopSet());
        // Stem exclusions are parsed next, with an empty set passed as the default argument.
        final CharArraySet stemExclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);

        this.analyzer = new SerbianAnalyzer(stopWords, stemExclusions);
    }

    /**
     * Returns the analyzer built in the constructor.
     *
     * @return the single {@link SerbianAnalyzer} instance held by this provider
     */
    @Override
    public SerbianAnalyzer get() {
        return analyzer;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
import org.tartarus.snowball.ext.PortugueseStemmer;
import org.tartarus.snowball.ext.RomanianStemmer;
import org.tartarus.snowball.ext.RussianStemmer;
import org.tartarus.snowball.ext.SerbianStemmer;
import org.tartarus.snowball.ext.SpanishStemmer;
import org.tartarus.snowball.ext.SwedishStemmer;
import org.tartarus.snowball.ext.TurkishStemmer;
Expand Down Expand Up @@ -257,7 +258,10 @@ public TokenStream create(TokenStream tokenStream) {
} else if ("russian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new RussianStemmer());
} else if ("light_russian".equalsIgnoreCase(language) || "lightRussian".equalsIgnoreCase(language)) {
return new RussianLightStemFilter(tokenStream);
return new RussianLightStemFilter(tokenStream);

} else if ("serbian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new SerbianStemmer());

// Spanish stemmers
} else if ("spanish".equalsIgnoreCase(language)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,35 @@
- length: { tokens: 1 }
- match: { tokens.0.token: вмест }

---
# REST test for the "serbian" analyzer: the same Cyrillic input is analyzed once
# with the analyzer referenced by name and once with a custom analyzer of
# type "serbian" defined on index "test"; both must yield the same single token.
"serbian":
# Create index "test" with a custom analyzer "my_analyzer" whose type is serbian.
- do:
indices.create:
index: test
body:
settings:
analysis:
analyzer:
my_analyzer:
type: serbian

# Analyze without an index, referencing the analyzer by the name "serbian".
# Two Cyrillic words go in; exactly one token, in Latin script, is expected.
# NOTE(review): presumably the first word is dropped as a stop word and the
# second is transliterated and stemmed - confirm against the Lucene analyzer.
- do:
indices.analyze:
body:
text: можемо заједно
analyzer: serbian
- length: { tokens: 1 }
- match: { tokens.0.token: zajedn }

# Repeat against index "test" using the custom "my_analyzer"; the expected
# output is identical to the call above.
- do:
indices.analyze:
index: test
body:
text: можемо заједно
analyzer: my_analyzer
- length: { tokens: 1 }
- match: { tokens.0.token: zajedn }

---
"sorani":
- do:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
import org.apache.lucene.analysis.ro.RomanianAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.sr.SerbianAnalyzer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
Expand Down Expand Up @@ -148,6 +149,7 @@ public static CharArraySet parseStemExclusion(Settings settings, CharArraySet de
namedStopWords.put("_portuguese_", PortugueseAnalyzer.getDefaultStopSet());
namedStopWords.put("_romanian_", RomanianAnalyzer.getDefaultStopSet());
namedStopWords.put("_russian_", RussianAnalyzer.getDefaultStopSet());
namedStopWords.put("_serbian_", SerbianAnalyzer.getDefaultStopSet());
namedStopWords.put("_sorani_", SoraniAnalyzer.getDefaultStopSet());
namedStopWords.put("_spanish_", SpanishAnalyzer.getDefaultStopSet());
namedStopWords.put("_swedish_", SwedishAnalyzer.getDefaultStopSet());
Expand Down

0 comments on commit 4b41c2a

Please sign in to comment.