Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2.x] Add a new configuration setting synonym_analyzer for synonym_graph and synonym #16625

Merged
merged 1 commit into from
Nov 13, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Add dynamic setting allowing size > 0 requests to be cached in the request cache ([#16483](https://github.com/opensearch-project/OpenSearch/pull/16483))
- Make IndexStoreListener a pluggable interface ([#16583](https://github.com/opensearch-project/OpenSearch/pull/16583))
- Add vertical scaling and SoftReference for snapshot repository data cache ([#16489](https://github.com/opensearch-project/OpenSearch/pull/16489))
- Add new configuration setting `synonym_analyzer`, to the `synonym` and `synonym_graph` filters, enabling the specification of a custom analyzer for reading the synonym file ([#16488](https://github.com/opensearch-project/OpenSearch/pull/16488)).

### Dependencies
- Bump `com.google.apis:google-api-services-compute` from v1-rev20240407-2.0.0 to v1-rev20241021-2.0.0 ([#16502](https://github.com/opensearch-project/OpenSearch/pull/16502), [#16548](https://github.com/opensearch-project/OpenSearch/pull/16548))
Original file line number Diff line number Diff line change
@@ -146,6 +146,7 @@
import org.opensearch.index.analysis.PreConfiguredTokenizer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.opensearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.opensearch.plugins.AnalysisPlugin;
@@ -247,7 +248,7 @@ public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAn
}

@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters(AnalysisModule analysisModule) {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
filters.put("apostrophe", ApostropheFilterFactory::new);
filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
@@ -325,14 +326,36 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new);
filters.put("stemmer_override", requiresAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
filters.put("stemmer", StemmerTokenFilterFactory::new);
filters.put("synonym", requiresAnalysisSettings(SynonymTokenFilterFactory::new));
filters.put("synonym_graph", requiresAnalysisSettings(SynonymGraphTokenFilterFactory::new));
filters.put("trim", TrimTokenFilterFactory::new);
filters.put("truncate", requiresAnalysisSettings(TruncateTokenFilterFactory::new));
filters.put("unique", UniqueTokenFilterFactory::new);
filters.put("uppercase", UpperCaseTokenFilterFactory::new);
filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
filters.put(
"synonym",
requiresAnalysisSettings(
(indexSettings, environment, name, settings) -> new SynonymTokenFilterFactory(
indexSettings,
environment,
name,
settings,
analysisModule.getAnalysisRegistry()
)
)
);
filters.put(
"synonym_graph",
requiresAnalysisSettings(
(indexSettings, environment, name, settings) -> new SynonymGraphTokenFilterFactory(
indexSettings,
environment,
name,
settings,
analysisModule.getAnalysisRegistry()
)
)
);
return filters;
}

Original file line number Diff line number Diff line change
@@ -40,6 +40,7 @@
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AnalysisMode;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
@@ -49,8 +50,14 @@

public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {

SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, env, name, settings);
SynonymGraphTokenFilterFactory(
IndexSettings indexSettings,
Environment env,
String name,
Settings settings,
AnalysisRegistry analysisRegistry
) {
super(indexSettings, env, name, settings, analysisRegistry);
}

@Override
Original file line number Diff line number Diff line change
@@ -44,11 +44,13 @@
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.Analysis;
import org.opensearch.index.analysis.AnalysisMode;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.index.analysis.CustomAnalyzer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
@@ -64,8 +66,16 @@
protected final Settings settings;
protected final Environment environment;
protected final AnalysisMode analysisMode;

SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
private final String synonymAnalyzerName;
private final AnalysisRegistry analysisRegistry;

SynonymTokenFilterFactory(
IndexSettings indexSettings,
Environment env,
String name,
Settings settings,
AnalysisRegistry analysisRegistry
) {
super(indexSettings, name, settings);
this.settings = settings;

@@ -83,6 +93,8 @@
boolean updateable = settings.getAsBoolean("updateable", false);
this.analysisMode = updateable ? AnalysisMode.SEARCH_TIME : AnalysisMode.ALL;
this.environment = env;
this.synonymAnalyzerName = settings.get("synonym_analyzer", null);
this.analysisRegistry = analysisRegistry;
}

@Override
@@ -137,6 +149,17 @@
List<TokenFilterFactory> tokenFilters,
Function<String, TokenFilterFactory> allFilters
) {
if (synonymAnalyzerName != null) {
Analyzer customSynonymAnalyzer;
try {
customSynonymAnalyzer = analysisRegistry.getAnalyzer(synonymAnalyzerName);
} catch (IOException e) {
throw new RuntimeException(e);

Check warning on line 157 in modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java

Codecov / codecov/patch

modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java#L156-L157

Added lines #L156 - L157 were not covered by tests
}
if (customSynonymAnalyzer != null) {
return customSynonymAnalyzer;
}
}
return new CustomAnalyzer(
tokenizer,
charFilters.toArray(new CharFilterFactory[0]),
Original file line number Diff line number Diff line change
@@ -39,12 +39,16 @@
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.te.TeluguNormalizationFilterFactory;
import org.apache.lucene.analysis.te.TeluguStemFilterFactory;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.indices.analysis.AnalysisFactoryTestCase;
import org.opensearch.indices.analysis.AnalysisModule;

import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.mockito.Mock;

import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.toList;

@@ -53,6 +57,9 @@ public CommonAnalysisFactoryTests() {
super(new CommonAnalysisPlugin());
}

@Mock
private AnalysisModule analysisModule;

@Override
protected Map<String, Class<?>> getTokenizers() {
Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
@@ -302,4 +309,19 @@ private void markedTestCase(String name, Map<String, Class<?>> map) {
unmarked
);
}

/**
* Tests the getTokenFilters(AnalysisModule) method to verify:
* 1. All token filters are properly loaded
* 2. Basic filters remain available
* 3. Synonym filters remain available when AnalysisModule is provided
*/
public void testGetTokenFiltersWithAnalysisModule() {
CommonAnalysisPlugin plugin = (CommonAnalysisPlugin) getAnalysisPlugin();
Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> filters = plugin.getTokenFilters(analysisModule);
assertNotNull("Token filters should not be null", filters);
assertTrue("Should contain basic filters", filters.containsKey("lowercase"));
assertTrue("Should contain synonym filter", filters.containsKey("synonym"));
assertTrue("Should contain synonym_graph filter", filters.containsKey("synonym_graph"));
}
}
Original file line number Diff line number Diff line change
@@ -42,11 +42,14 @@
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.env.TestEnvironment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.index.analysis.IndexAnalyzers;
import org.opensearch.index.analysis.PreConfiguredTokenFilter;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.test.IndexSettingsModule;
import org.opensearch.test.OpenSearchTestCase;
import org.opensearch.test.VersionUtils;
@@ -64,6 +67,7 @@
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.startsWith;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class SynonymsAnalysisTests extends OpenSearchTestCase {
private IndexAnalyzers indexAnalyzers;
@@ -259,14 +263,17 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
.put("hyphenation_patterns_path", "foo")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
Environment environment = TestEnvironment.newEnvironment(settings);
AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisPlugin()));
AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry();

String[] bypassingFactories = new String[] { "dictionary_decompounder" };

CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
for (String factory : bypassingFactories) {
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);
Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);

try (TokenStream ts = analyzer.tokenStream("field", "text")) {
@@ -329,7 +336,10 @@ public void testDisallowedTokenFilters() throws IOException {
.putList("common_words", "a", "b")
.put("output_unigrams", "true")
.build();
Environment environment = TestEnvironment.newEnvironment(settings);
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisPlugin()));
AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry();
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();

String[] disallowedFactories = new String[] {
@@ -343,9 +353,9 @@ public void testDisallowedTokenFilters() throws IOException {
"fingerprint" };

for (String factory : disallowedFactories) {
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);

IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
@@ -372,4 +382,76 @@ private void match(String analyzerName, String source, String target) throws IOE
MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
}

/**
* Tests the integration of word delimiter and synonym graph filters with synonym_analyzer based on issue #16263.
* This test verifies the correct handling of:
* 1. Hyphenated words with word delimiter (e.g., "note-book" → ["notebook", "note", "book"])
* 2. Multi-word synonyms (e.g., "mobile phone" → ["smartphone"])
* 3. Single word synonyms (e.g., "laptop" → ["notebook"])
*
* @see <a href="https://github.com/opensearch-project/OpenSearch/issues/16263">Issue #16263</a>
*/
public void testSynonymAnalyzerWithWordDelimiter() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.put("index.analysis.filter.custom_word_delimiter.type", "word_delimiter_graph")
.put("index.analysis.filter.custom_word_delimiter.generate_word_parts", true)
.put("index.analysis.filter.custom_word_delimiter.catenate_all", true)
.put("index.analysis.filter.custom_word_delimiter.split_on_numerics", false)
.put("index.analysis.filter.custom_word_delimiter.split_on_case_change", false)
.put("index.analysis.filter.custom_pattern_replace_filter.type", "pattern_replace")
.put("index.analysis.filter.custom_pattern_replace_filter.pattern", "(-)")
.put("index.analysis.filter.custom_pattern_replace_filter.replacement", " ")
.put("index.analysis.filter.custom_pattern_replace_filter.all", true)
.put("index.analysis.filter.custom_synonym_graph_filter.type", "synonym_graph")
.putList(
"index.analysis.filter.custom_synonym_graph_filter.synonyms",
"laptop => notebook",
"smartphone, mobile phone, cell phone => smartphone",
"tv, television => television"
)
.put("index.analysis.filter.custom_synonym_graph_filter.synonym_analyzer", "standard")
.put("index.analysis.analyzer.text_en_index.type", "custom")
.put("index.analysis.analyzer.text_en_index.tokenizer", "whitespace")
.putList(
"index.analysis.analyzer.text_en_index.filter",
"lowercase",
"custom_word_delimiter",
"custom_synonym_graph_filter",
"custom_pattern_replace_filter",
"flatten_graph"
)
.build();
Environment environment = TestEnvironment.newEnvironment(settings);
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule module = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisPlugin()));
IndexAnalyzers analyzers = module.getAnalysisRegistry().build(indexSettings);
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "note-book")) {
assertTokenStreamContents(
ts,
new String[] { "notebook", "note", "book" },
new int[] { 0, 0, 5 },
new int[] { 9, 4, 9 },
new String[] { "word", "word", "word" },
new int[] { 1, 0, 1 },
new int[] { 2, 1, 1 }
);
}
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "mobile phone")) {
assertTokenStreamContents(
ts,
new String[] { "smartphone" },
new int[] { 0 },
new int[] { 12 },
new String[] { "SYNONYM" },
new int[] { 1 },
new int[] { 1 }
);
}
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "laptop")) {
assertTokenStreamContents(ts, new String[] { "notebook" }, new int[] { 0 }, new int[] { 6 });
}
}

}
Original file line number Diff line number Diff line change
@@ -181,7 +181,12 @@ public boolean requiresAnalysisSettings() {
)
);

tokenFilters.extractAndRegister(plugins, AnalysisPlugin::getTokenFilters);
for (AnalysisPlugin plugin : plugins) {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = plugin.getTokenFilters(this);
for (Map.Entry<String, AnalysisProvider<TokenFilterFactory>> entry : filters.entrySet()) {
tokenFilters.register(entry.getKey(), entry.getValue());
}
}
return tokenFilters;
}

Original file line number Diff line number Diff line change
@@ -47,6 +47,7 @@
import org.opensearch.index.analysis.PreConfiguredTokenizer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider;

import java.io.IOException;
@@ -92,6 +93,14 @@ default Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return emptyMap();
}

/**
* Override to add additional {@link TokenFilter}s that need access to the AnalysisModule.
* The default implementation for plugins that don't need AnalysisModule calls the existing getTokenFilters() method.
*/
default Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters(AnalysisModule analysisModule) {
return getTokenFilters();
}

/**
* Override to add additional {@link Tokenizer}s. See {@link #requiresAnalysisSettings(AnalysisProvider)}
* how to on get the configuration from the index.
Loading