Updated PR with latest code base.
jackluo923 committed Jul 30, 2024
1 parent 3129017 commit 99d40f7
Showing 8 changed files with 111 additions and 64 deletions.
RealtimeLuceneTextIndex.java
@@ -31,6 +31,7 @@
import org.apache.lucene.search.SearcherManager;
import org.apache.pinot.segment.local.indexsegment.mutable.MutableSegmentImpl;
import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
import org.apache.pinot.segment.local.segment.store.TextIndexUtils;
import org.apache.pinot.segment.local.utils.LuceneTextIndexUtils;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.apache.pinot.segment.spi.index.mutable.MutableTextIndex;
@@ -85,7 +86,7 @@ public RealtimeLuceneTextIndex(String column, File segmentIndexDir, String segme
_searcherManager = new SearcherManager(indexWriter, false, false, null);
_analyzer = _indexCreator.getIndexWriter().getConfig().getAnalyzer();
_queryParserClassConstructor =
getQueryParserWithStringAndAnalyzerTypeConstructor(config.getLuceneQueryParserClass());
TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor(config.getLuceneQueryParserClass());
_enablePrefixSuffixMatchingInPhraseQueries = config.isEnablePrefixSuffixMatchingInPhraseQueries();
} catch (Exception e) {
LOGGER.error("Failed to instantiate realtime Lucene index reader for column {}, exception {}", column,
@@ -131,16 +132,20 @@ public MutableRoaringBitmap getDocIds(String searchQuery) {
// and can be created upfront.
QueryParserBase parser = _queryParserClassConstructor.newInstance(_column, _analyzer);
if (_enablePrefixSuffixMatchingInPhraseQueries) {
// TODO: this code path is semi-broken, as the default QueryParser does not always utilize the analyzer
// passed into the constructor for wildcards in phrases; consider using a custom Lucene query parser instead
// https://github.com/elastic/elasticsearch/issues/22540
// Note: Lucene's built-in QueryParser has limited wildcard functionality in phrase queries. It does not use
// the provided analyzer when wildcards are present, defaulting to the default analyzer for tokenization.
// Additionally, it does not support wildcards that span across terms.
// For more details, see: https://github.com/elastic/elasticsearch/issues/22540
// Workaround: Use a custom query parser that correctly implements wildcard searches.
parser.setAllowLeadingWildcard(true);
}
Query query = parser.parse(searchQuery);
if (_enablePrefixSuffixMatchingInPhraseQueries) {
// TODO: this code path is semi-broken, as the default QueryParser does not always utilize the analyzer
// passed into the constructor for wildcards in phrases; consider using a custom Lucene query parser instead
// https://github.com/elastic/elasticsearch/issues/22540
// Note: Lucene's built-in QueryParser has limited wildcard functionality in phrase queries. It does not use
// the provided analyzer when wildcards are present, defaulting to the default analyzer for tokenization.
// Additionally, it does not support wildcards that span across terms.
// For more details, see: https://github.com/elastic/elasticsearch/issues/22540
// Workaround: Use a custom query parser that correctly implements wildcard searches.
query = LuceneTextIndexUtils.convertToMultiTermSpanQuery(query);
}
indexSearcher = _searcherManager.acquire();
@@ -209,7 +214,7 @@ private Constructor<QueryParserBase> getQueryParserWithStringAndAnalyzerTypeCons
throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName
+ " does not have the required constructor with parameter types "
+ "[String.class, Analyzer.class]"
);
);
}

return (Constructor<QueryParserBase>) queryParserClass.getConstructor(String.class, Analyzer.class);
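To make the workaround concrete: below is a minimal sketch of the rewrite that convertToMultiTermSpanQuery performs for a phrase with a leading wildcard. It assumes Lucene 9.x (span classes under org.apache.lucene.queries.spans) and a hypothetical field name text_col; the exact clause construction inside LuceneTextIndexUtils may differ.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.queries.spans.SpanNearQuery;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.queries.spans.SpanTermQuery;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.WildcardQuery;

public class PhraseWildcardSketch {
  public static void main(String[] args) throws Exception {
    QueryParser parser = new QueryParser("text_col", new StandardAnalyzer());
    // Without this, the classic parser rejects a leading '*' outright.
    parser.setAllowLeadingWildcard(true);
    Query parsed = parser.parse("\"*pache pinot\"");
    // Inside a phrase the '*' is not treated as a wildcard: the analyzer sees
    // plain text, so `*pache` never matches 'apache'. Rewriting into a
    // span-near query with a wrapped WildcardQuery restores the intended
    // prefix/suffix semantics.
    SpanQuery[] clauses = {
        new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("text_col", "*pache"))),
        new SpanTermQuery(new Term("text_col", "pinot"))
    };
    Query rewritten = new SpanNearQuery(clauses, 0, true);
    System.out.println(parsed + " -> " + rewritten);
  }
}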
LuceneTextIndexReader.java
@@ -21,6 +21,7 @@
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.nio.ByteOrder;
import java.util.Map;
import javax.annotation.Nullable;
@@ -29,6 +30,7 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@@ -66,6 +68,8 @@ public class LuceneTextIndexReader implements TextIndexReader {
private final DocIdTranslator _docIdTranslator;
private final Analyzer _analyzer;
private boolean _useANDForMultiTermQueries = false;
private final String _queryParserClass;
private Constructor<QueryParserBase> _queryParserClassConstructor;
private boolean _enablePrefixSuffixMatchingInPhraseQueries = false;

public LuceneTextIndexReader(String column, File indexDir, int numDocs, TextIndexConfig config) {
@@ -90,6 +94,9 @@ public LuceneTextIndexReader(String column, File indexDir, int numDocs, TextInde
// mapping file upfront on segment load v/s on-the-fly during query processing
_docIdTranslator = new DocIdTranslator(indexDir, _column, numDocs, _indexSearcher);
_analyzer = TextIndexUtils.getAnalyzer(config);
_queryParserClass = config.getLuceneQueryParserClass();
_queryParserClassConstructor =
TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor(_queryParserClass);
LOGGER.info("Successfully read lucene index for {} from {}", _column, indexDir);
} catch (Exception e) {
LOGGER.error("Failed to instantiate Lucene text index reader for column {}, exception {}", column,
@@ -150,17 +157,19 @@ public MutableRoaringBitmap getDocIds(String searchQuery) {
// Lucene Query Parser is JavaCC based. It is stateful and should
// be instantiated per query. Analyzer on the other hand is stateless
// and can be created upfront.
QueryParser parser = new QueryParser(_column, _analyzer);
QueryParserBase parser = _queryParserClassConstructor.newInstance(_column, _analyzer);
// Phrase search with prefix/suffix matching may have a leading *, e.g. `*pache pinot`, whose leading wildcard
// can be stripped by the query parser. To support the feature, we need to explicitly allow leading wildcards.
if (_enablePrefixSuffixMatchingInPhraseQueries) {
if (_queryParserClass.equals("org.apache.lucene.queryparser.classic.QueryParser")
&& _enablePrefixSuffixMatchingInPhraseQueries) {
parser.setAllowLeadingWildcard(true);
}
if (_useANDForMultiTermQueries) {
if (_queryParserClass.equals("org.apache.lucene.queryparser.classic.QueryParser") && _useANDForMultiTermQueries) {
parser.setDefaultOperator(QueryParser.Operator.AND);
}
Query query = parser.parse(searchQuery);
if (_enablePrefixSuffixMatchingInPhraseQueries) {
if (_queryParserClass.equals("org.apache.lucene.queryparser.classic.QueryParser")
&& _enablePrefixSuffixMatchingInPhraseQueries) {
query = LuceneTextIndexUtils.convertToMultiTermSpanQuery(query);
}
_indexSearcher.search(query, docIDCollector);
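Since the stock QueryParser is now only one of several possible parser classes, the classic-parser toggles above are applied conditionally. A rough, self-contained sketch of the reflective construction pattern, assuming the stock Lucene classic parser and an illustrative column name:

import java.lang.reflect.Constructor;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.Query;

public class ReflectiveParserSketch {
  public static void main(String[] args) throws Exception {
    String parserClass = "org.apache.lucene.queryparser.classic.QueryParser";
    @SuppressWarnings("unchecked")
    Constructor<QueryParserBase> ctor = (Constructor<QueryParserBase>)
        Class.forName(parserClass).getConstructor(String.class, Analyzer.class);
    Analyzer analyzer = new StandardAnalyzer();  // stateless, safe to share across queries
    // The JavaCC-generated parser is stateful, so build a fresh one per query.
    QueryParserBase parser = ctor.newInstance("text_col", analyzer);
    if (parserClass.equals("org.apache.lucene.queryparser.classic.QueryParser")) {
      // Behavior toggles that only the classic parser is known to honor.
      parser.setDefaultOperator(QueryParser.Operator.AND);
    }
    Query query = parser.parse("apache pinot");
    System.out.println(query);  // +text_col:apache +text_col:pinot
  }
}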
TextIndexConfigBuilder.java
@@ -29,6 +29,10 @@


public class TextIndexConfigBuilder extends TextIndexConfig.AbstractBuilder {
public TextIndexConfigBuilder() {
super((FSTType) null);
}

public TextIndexConfigBuilder(@Nullable FSTType fstType) {
super(fstType);
}
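The new no-arg constructor spares callers the (FSTType) null cast. A minimal usage sketch, mirroring the test changes later in this commit:

import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.index.TextIndexConfig;

public class BuilderUsageSketch {
  public static void main(String[] args) {
    // Equivalent to new TextIndexConfigBuilder((FSTType) null) before this change.
    TextIndexConfig config = new TextIndexConfigBuilder()
        .withUseANDForMultiTermQueries(false)
        .build();
    System.out.println(config);
  }
}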
TextIndexUtils.java
@@ -19,6 +19,7 @@
package org.apache.pinot.segment.local.segment.store;

import java.io.File;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -31,6 +32,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
import org.apache.pinot.segment.spi.V1Constants.Indexes;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
@@ -260,4 +262,25 @@ public static StandardAnalyzer getStandardAnalyzerWithCustomizedStopWords(@Nulla
}
return new StandardAnalyzer(new CharArraySet(stopWordSet, true));
}

public static Constructor<QueryParserBase> getQueryParserWithStringAndAnalyzerTypeConstructor(
String queryParserClassName) throws ReflectiveOperationException {
// Fail fast if the specified query parser class is not a QueryParserBase subclass
final Class<?> queryParserClass = Class.forName(queryParserClassName);
if (!QueryParserBase.class.isAssignableFrom(queryParserClass)) {
throw new ReflectiveOperationException("The specified lucene query parser class " + queryParserClassName
+ " is not a subclass of " + QueryParserBase.class.getName());
}
// Fail-fast if the query parser does not have the required constructor used by this class
try {
queryParserClass.getConstructor(String.class, Analyzer.class);
} catch (NoSuchMethodException ex) {
throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName
+ " does not have the required constructor with parameter types "
+ "[String.class, Analyzer.class]"
);
}

return (Constructor<QueryParserBase>) queryParserClass.getConstructor(String.class, Analyzer.class);
}
}
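A short sketch of how the new helper behaves, driven with one valid and one invalid parser class; the driver class itself is illustrative:

import java.lang.reflect.Constructor;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.pinot.segment.local.segment.store.TextIndexUtils;

public class QueryParserResolutionSketch {
  public static void main(String[] args) throws Exception {
    // Happy path: the classic parser extends QueryParserBase and exposes the
    // (String, Analyzer) constructor, so a usable Constructor comes back.
    Constructor<QueryParserBase> ctor = TextIndexUtils
        .getQueryParserWithStringAndAnalyzerTypeConstructor(
            "org.apache.lucene.queryparser.classic.QueryParser");
    QueryParserBase parser = ctor.newInstance("text_col", new StandardAnalyzer());
    System.out.println(parser.parse("apache pinot"));

    // Fail-fast path: java.lang.String is not a QueryParserBase, so the helper
    // throws before any query parsing is attempted.
    try {
      TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor("java.lang.String");
    } catch (ReflectiveOperationException e) {
      System.out.println("rejected: " + e.getMessage());
    }
  }
}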
RealtimeSegmentConverterTest.java
@@ -53,7 +53,6 @@
import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl;
import org.apache.pinot.segment.spi.index.reader.TextIndexReader;
import org.apache.pinot.segment.spi.store.SegmentDirectory;
import org.apache.pinot.spi.config.table.FSTType;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.config.table.IndexConfig;
import org.apache.pinot.spi.config.table.IndexingConfig;
@@ -86,7 +85,6 @@ public class RealtimeSegmentConverterTest {
private static final String LONG_COLUMN4 = "long_col4";
private static final String MV_INT_COLUMN = "mv_col";
private static final String DATE_TIME_COLUMN = "date_time_col";
private static final FSTType NULL_FST_TYPE = null;

private static final File TMP_DIR =
new File(FileUtils.getTempDirectory(), RealtimeSegmentConverterTest.class.getName());
@@ -475,7 +473,7 @@ public void testSegmentBuilderWithReuse(boolean columnMajorSegmentBuilder, Strin
String tableNameWithType = tableConfig.getTableName();
String segmentName = "testTable__0__0__123456";
IndexingConfig indexingConfig = tableConfig.getIndexingConfig();
TextIndexConfig textIndexConfig = new TextIndexConfigBuilder(NULL_FST_TYPE).build();
TextIndexConfig textIndexConfig = new TextIndexConfigBuilder().withUseANDForMultiTermQueries(false).build();

RealtimeSegmentConfig.Builder realtimeSegmentConfigBuilder =
new RealtimeSegmentConfig.Builder().setTableNameWithType(tableNameWithType).setSegmentName(segmentName)
RealtimeLuceneTextIndexTest.java
@@ -23,6 +23,7 @@
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.search.SearcherManager;
import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
@@ -71,8 +72,8 @@ private String[][] getMVTextData() {
@BeforeClass
public void setUp()
throws Exception {
TextIndexConfig config =
new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, null, null, null, false);
TextIndexConfig config = new TextIndexConfigBuilder().withUseANDForMultiTermQueries(false).build();

_realtimeLuceneTextIndex =
new RealtimeLuceneTextIndex(TEXT_COLUMN_NAME, INDEX_DIR, "fooBar", config);
_nativeMutableTextIndex = new NativeMutableTextIndex(TEXT_COLUMN_NAME);
TextIndexConfig.java
@@ -74,8 +74,8 @@ public TextIndexConfig(@JsonProperty("disabled") Boolean disabled, @JsonProperty
_rawValueForTextIndex = rawValueForTextIndex;
_enableQueryCache = enableQueryCache;
_useANDForMultiTermQueries = useANDForMultiTermQueries;
_stopWordsInclude = stopWordsInclude;
_stopWordsExclude = stopWordsExclude;
_stopWordsInclude = null == stopWordsInclude ? Collections.emptyList() : stopWordsInclude;
_stopWordsExclude = null == stopWordsExclude ? Collections.emptyList() : stopWordsExclude;
_luceneUseCompoundFile =
luceneUseCompoundFile == null ? LUCENE_INDEX_DEFAULT_USE_COMPOUND_FILE : luceneUseCompoundFile;
_luceneMaxBufferSizeMB =
@@ -219,7 +219,7 @@ public TextIndexConfig build() {
return new TextIndexConfig(false, _fstType, _rawValueForTextIndex, _enableQueryCache, _useANDForMultiTermQueries,
_stopWordsInclude, _stopWordsExclude, _luceneUseCompoundFile, _luceneMaxBufferSizeMB, _luceneAnalyzerClass,
CsvParser.serialize(_luceneAnalyzerClassArgs, true, false),
CsvParser.serialize(_luceneAnalyzerClassArgTypes, false, false),
CsvParser.serialize(_luceneAnalyzerClassArgTypes, true, false),
_luceneQueryParserClass, _enablePrefixSuffixMatchingInPhraseQueries);
}

@@ -230,6 +230,11 @@ public AbstractBuilder withRawValueForTextIndex(@Nullable Object rawValueForText
return this;
}

public AbstractBuilder withUseANDForMultiTermQueries(boolean useANDForMultiTermQueries) {
_useANDForMultiTermQueries = useANDForMultiTermQueries;
return this;
}

public AbstractBuilder withStopWordsInclude(List<String> stopWordsInclude) {
_stopWordsInclude = stopWordsInclude;
return this;
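With the null-coalescing above, unset stop-word lists now surface as empty lists rather than null. A small sketch of the expected behavior; the getter names are assumed from the field names in this class:

import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.index.TextIndexConfig;

public class StopWordsDefaultSketch {
  public static void main(String[] args) {
    TextIndexConfig config = new TextIndexConfigBuilder().build();
    // Previously these could be null when no stop words were configured;
    // callers can now iterate without a null check.
    System.out.println(config.getStopWordsInclude().isEmpty());  // true
    System.out.println(config.getStopWordsExclude().isEmpty());  // true
  }
}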
CsvParser.java
@@ -27,55 +27,57 @@
import javax.annotation.Nullable;

public class CsvParser {
private CsvParser() {
// Hide utility class default constructor
private CsvParser() {
// Hide utility class default constructor
}

/**
* Parse the input csv string with customizable parsing behavior. Sometimes the individual values may contain
* commas and other whitespace characters that are expected to be part of the actual argument.
*
* @param input string to split on comma
* @param escapeComma if true, we don't split on escaped commas, and we replace "\," with "," after the split
* @param trim whether we should trim each tokenized term
* @return a list of values, empty list if input is empty or null
*/
public static List<String> parse(@Nullable String input, boolean escapeComma, boolean trim) {
if (null == input || input.isEmpty()) {
return Collections.emptyList();
}

/**
* Parse the input csv string with customizable parsing behavior. Sometimes the individual values may contain
* commas and other whitespace characters that are expected to be part of the actual argument.
*
* @param input string to split on comma
* @param escapeComma if true, we do not split on escaped commas, and we replace "\," with "," after the split
* @param trim whether we should trim each tokenized term
* @return a list of values, empty list if input is empty or null
*/
public static List<String> parse(@Nullable String input, boolean escapeComma, boolean trim) {
if (null == input || input.isEmpty()) {
return Collections.emptyList();
}
Stream<String> tokenStream;
if (escapeComma) {
// Use regular expression to split on "," unless it is "\,"
// Use a negative limit so the pattern is applied as many times as possible and trailing empty
// strings are not discarded
tokenStream = Arrays.stream(input.split("(?<!\\\\),", -1))
.map(s -> s.replace("\\,", ","));
} else {
tokenStream = Arrays.stream(input.split(","));
}

Stream<String> tokenStream;
if (escapeComma) {
// Use regular expression to split on "," unless it is "\,"
tokenStream = Arrays.stream(input.split("(?<!\\\\),"))
.map(s -> s.replace("\\,", ","));
} else {
tokenStream = Arrays.stream(input.split(","));
}
if (trim) {
tokenStream = tokenStream.map(String::trim);
}

if (trim) {
tokenStream = tokenStream.map(String::trim);
}
return tokenStream.collect(Collectors.toList());
}

return tokenStream.collect(Collectors.toList());
/**
* Serialize the input list of strings with customizable serialization behavior.
* @param input list of strings to be serialized
* @param escapeComma if true, escape commas by replacing "," with "\," before the join
* @param trim whether we should trim each token before serialization
* @return serialized string representing the input list of strings
*/
public static String serialize(List<String> input, boolean escapeComma, boolean trim) {
Stream<String> tokenStream = input.stream();
if (escapeComma) {
tokenStream = tokenStream.map(s -> s.replaceAll(",", Matcher.quoteReplacement("\\,")));
}

/**
* Serialize the input list of strings with customizable serialization behavior.
* @param input list of strings to be serialized
* @param escapeComma if true, escape commas by replacing "," with "\," before the join
* @param trim whether we should trim each token before serialization
* @return serialized string representing the input list of strings
*/
public static String serialize(List<String> input, boolean escapeComma, boolean trim) {
Stream<String> tokenStream = input.stream();
if (escapeComma) {
tokenStream = tokenStream.map(s -> s.replaceAll(",", Matcher.quoteReplacement("\\,")));
}
if (trim) {
tokenStream = tokenStream.map(String::trim);
}
return tokenStream.collect(Collectors.joining(","));
if (trim) {
tokenStream = tokenStream.map(String::trim);
}
return tokenStream.collect(Collectors.joining(","));
}
}
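A brief usage sketch of the escaping behavior and the negative split limit discussed above; the import path for CsvParser is assumed, and expected outputs are shown in comments:

import java.util.Arrays;
import java.util.List;
import org.apache.pinot.spi.utils.CsvParser;

public class CsvParserSketch {
  public static void main(String[] args) {
    // An escaped comma stays inside its token; the bare comma still splits.
    List<String> parsed = CsvParser.parse("a\\,b, c", true, true);
    System.out.println(parsed);  // [a,b, c] -- two tokens: "a,b" and "c"

    // The -1 split limit preserves trailing empty tokens.
    System.out.println(CsvParser.parse("a,,", true, false));  // [a, , ]

    // serialize() re-escapes embedded commas, so parse() round-trips it.
    String joined = CsvParser.serialize(Arrays.asList("a,b", "c"), true, false);
    System.out.println(joined);  // a\,b,c
    System.out.println(CsvParser.parse(joined, true, false));  // [a,b, c]
  }
}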
