Updated PR with latest code base.
jackluo923 committed Jul 30, 2024
1 parent 3129017 commit 99d40f7
Showing 8 changed files with 111 additions and 64 deletions.
RealtimeLuceneTextIndex.java
@@ -31,6 +31,7 @@
import org.apache.lucene.search.SearcherManager;
import org.apache.pinot.segment.local.indexsegment.mutable.MutableSegmentImpl;
import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
import org.apache.pinot.segment.local.segment.store.TextIndexUtils;
import org.apache.pinot.segment.local.utils.LuceneTextIndexUtils;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.apache.pinot.segment.spi.index.mutable.MutableTextIndex;
@@ -85,7 +86,7 @@ public RealtimeLuceneTextIndex(String column, File segmentIndexDir, String segme
_searcherManager = new SearcherManager(indexWriter, false, false, null);
_analyzer = _indexCreator.getIndexWriter().getConfig().getAnalyzer();
_queryParserClassConstructor =
getQueryParserWithStringAndAnalyzerTypeConstructor(config.getLuceneQueryParserClass());
TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor(config.getLuceneQueryParserClass());
_enablePrefixSuffixMatchingInPhraseQueries = config.isEnablePrefixSuffixMatchingInPhraseQueries();
} catch (Exception e) {
LOGGER.error("Failed to instantiate realtime Lucene index reader for column {}, exception {}", column,
@@ -131,16 +132,20 @@ public MutableRoaringBitmap getDocIds(String searchQuery) {
// and can be created upfront.
QueryParserBase parser = _queryParserClassConstructor.newInstance(_column, _analyzer);
if (_enablePrefixSuffixMatchingInPhraseQueries) {
// TODO: this code path is semi-broken, as the default QueryParser does not always utilize the analyzer
// passed into the constructor for wildcards in phrases; consider using a custom Lucene query parser instead
// https://github.com/elastic/elasticsearch/issues/22540
// Note: Lucene's built-in QueryParser has limited wildcard functionality in phrase queries. It does not use
// the provided analyzer when wildcards are present, defaulting to the default analyzer for tokenization.
// Additionally, it does not support wildcards that span across terms.
// For more details, see: https://github.com/elastic/elasticsearch/issues/22540
// Workaround: Use a custom query parser that correctly implements wildcard searches.
parser.setAllowLeadingWildcard(true);
}
Query query = parser.parse(searchQuery);
if (_enablePrefixSuffixMatchingInPhraseQueries) {
// TODO: this code path is semi-broken, as the default QueryParser does not always utilize the analyzer
// passed into the constructor for wildcards in phrases; consider using a custom Lucene query parser instead
// https://github.com/elastic/elasticsearch/issues/22540
// Note: Lucene's built-in QueryParser has limited wildcard functionality in phrase queries. It does not use
// the provided analyzer when wildcards are present, defaulting to the default analyzer for tokenization.
// Additionally, it does not support wildcards that span across terms.
// For more details, see: https://github.com/elastic/elasticsearch/issues/22540
// Workaround: Use a custom query parser that correctly implements wildcard searches.
query = LuceneTextIndexUtils.convertToMultiTermSpanQuery(query);
}
indexSearcher = _searcherManager.acquire();
@@ -209,7 +214,7 @@ private Constructor<QueryParserBase> getQueryParserWithStringAndAnalyzerTypeCons
throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName
+ " does not have the required constructor with parameter types "
+ "[String.class, Analyzer.class]"
);
);
}

return (Constructor<QueryParserBase>) queryParserClass.getConstructor(String.class, Analyzer.class);
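To make the workaround concrete: below is a minimal sketch of the rewrite that convertToMultiTermSpanQuery performs for a phrase with a leading wildcard. It assumes Lucene 9.x (span classes under org.apache.lucene.queries.spans) and a hypothetical field name text_col; the exact clause construction inside LuceneTextIndexUtils may differ.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.queries.spans.SpanNearQuery;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.queries.spans.SpanTermQuery;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.WildcardQuery;

public class PhraseWildcardSketch {
  public static void main(String[] args) throws Exception {
    QueryParser parser = new QueryParser("text_col", new StandardAnalyzer());
    // Without this, the classic parser rejects a leading '*' outright.
    parser.setAllowLeadingWildcard(true);
    Query parsed = parser.parse("\"*pache pinot\"");
    // Inside a phrase the '*' is not treated as a wildcard: the analyzer sees
    // plain text, so `*pache` never matches 'apache'. Rewriting into a
    // span-near query with a wrapped WildcardQuery restores the intended
    // prefix/suffix semantics.
    SpanQuery[] clauses = {
        new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("text_col", "*pache"))),
        new SpanTermQuery(new Term("text_col", "pinot"))
    };
    Query rewritten = new SpanNearQuery(clauses, 0, true);
    System.out.println(parsed + " -> " + rewritten);
  }
}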
LuceneTextIndexReader.java
@@ -21,6 +21,7 @@
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.nio.ByteOrder;
import java.util.Map;
import javax.annotation.Nullable;
@@ -29,6 +30,7 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@@ -66,6 +68,8 @@ public class LuceneTextIndexReader implements TextIndexReader {
private final DocIdTranslator _docIdTranslator;
private final Analyzer _analyzer;
private boolean _useANDForMultiTermQueries = false;
private final String _queryParserClass;
private Constructor<QueryParserBase> _queryParserClassConstructor;
private boolean _enablePrefixSuffixMatchingInPhraseQueries = false;

public LuceneTextIndexReader(String column, File indexDir, int numDocs, TextIndexConfig config) {
@@ -90,6 +94,9 @@ public LuceneTextIndexReader(String column, File indexDir, int numDocs, TextInde
// mapping file upfront on segment load v/s on-the-fly during query processing
_docIdTranslator = new DocIdTranslator(indexDir, _column, numDocs, _indexSearcher);
_analyzer = TextIndexUtils.getAnalyzer(config);
_queryParserClass = config.getLuceneQueryParserClass();
_queryParserClassConstructor =
TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor(_queryParserClass);
LOGGER.info("Successfully read lucene index for {} from {}", _column, indexDir);
} catch (Exception e) {
LOGGER.error("Failed to instantiate Lucene text index reader for column {}, exception {}", column,
@@ -150,17 +157,19 @@ public MutableRoaringBitmap getDocIds(String searchQuery) {
// Lucene Query Parser is JavaCC based. It is stateful and should
// be instantiated per query. Analyzer on the other hand is stateless
// and can be created upfront.
QueryParser parser = new QueryParser(_column, _analyzer);
QueryParserBase parser = _queryParserClassConstructor.newInstance(_column, _analyzer);
// Phrase search with prefix/suffix matching may have a leading *, e.g. `*pache pinot`, whose leading wildcard
// can be stripped by the query parser. To support the feature, we need to explicitly allow leading wildcards.
if (_enablePrefixSuffixMatchingInPhraseQueries) {
if (_queryParserClass.equals("org.apache.lucene.queryparser.classic.QueryParser")
&& _enablePrefixSuffixMatchingInPhraseQueries) {
parser.setAllowLeadingWildcard(true);
}
if (_useANDForMultiTermQueries) {
if (_queryParserClass.equals("org.apache.lucene.queryparser.classic.QueryParser") && _useANDForMultiTermQueries) {
parser.setDefaultOperator(QueryParser.Operator.AND);
}
Query query = parser.parse(searchQuery);
if (_enablePrefixSuffixMatchingInPhraseQueries) {
if (_queryParserClass.equals("org.apache.lucene.queryparser.classic.QueryParser")
&& _enablePrefixSuffixMatchingInPhraseQueries) {
query = LuceneTextIndexUtils.convertToMultiTermSpanQuery(query);
}
_indexSearcher.search(query, docIDCollector);
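Since the stock QueryParser is now only one of several possible parser classes, the classic-parser toggles above are applied conditionally. A rough, self-contained sketch of the reflective construction pattern, assuming the stock Lucene classic parser and an illustrative column name:

import java.lang.reflect.Constructor;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.Query;

public class ReflectiveParserSketch {
  public static void main(String[] args) throws Exception {
    String parserClass = "org.apache.lucene.queryparser.classic.QueryParser";
    @SuppressWarnings("unchecked")
    Constructor<QueryParserBase> ctor = (Constructor<QueryParserBase>)
        Class.forName(parserClass).getConstructor(String.class, Analyzer.class);
    Analyzer analyzer = new StandardAnalyzer();  // stateless, safe to share across queries
    // The JavaCC-generated parser is stateful, so build a fresh one per query.
    QueryParserBase parser = ctor.newInstance("text_col", analyzer);
    if (parserClass.equals("org.apache.lucene.queryparser.classic.QueryParser")) {
      // Behavior toggles that only the classic parser is known to honor.
      parser.setDefaultOperator(QueryParser.Operator.AND);
    }
    Query query = parser.parse("apache pinot");
    System.out.println(query);  // +text_col:apache +text_col:pinot
  }
}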
TextIndexConfigBuilder.java
@@ -29,6 +29,10 @@


public class TextIndexConfigBuilder extends TextIndexConfig.AbstractBuilder {
public TextIndexConfigBuilder() {
super((FSTType) null);
}

public TextIndexConfigBuilder(@Nullable FSTType fstType) {
super(fstType);
}
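The new no-arg constructor spares callers the (FSTType) null cast. A minimal usage sketch, mirroring the test changes later in this commit:

import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.index.TextIndexConfig;

public class BuilderUsageSketch {
  public static void main(String[] args) {
    // Equivalent to new TextIndexConfigBuilder((FSTType) null) before this change.
    TextIndexConfig config = new TextIndexConfigBuilder()
        .withUseANDForMultiTermQueries(false)
        .build();
    System.out.println(config);
  }
}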
TextIndexUtils.java
@@ -19,6 +19,7 @@
package org.apache.pinot.segment.local.segment.store;

import java.io.File;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -31,6 +32,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
import org.apache.pinot.segment.spi.V1Constants.Indexes;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
@@ -260,4 +262,25 @@ public static StandardAnalyzer getStandardAnalyzerWithCustomizedStopWords(@Nulla
}
return new StandardAnalyzer(new CharArraySet(stopWordSet, true));
}

public static Constructor<QueryParserBase> getQueryParserWithStringAndAnalyzerTypeConstructor(
String queryParserClassName) throws ReflectiveOperationException {
// Fail fast if the specified query parser class is not a QueryParserBase subclass
final Class<?> queryParserClass = Class.forName(queryParserClassName);
if (!QueryParserBase.class.isAssignableFrom(queryParserClass)) {
throw new ReflectiveOperationException("The specified lucene query parser class " + queryParserClassName
+ " is not a subclass of " + QueryParserBase.class.getName());
}
// Fail-fast if the query parser does not have the required constructor used by this class
try {
queryParserClass.getConstructor(String.class, Analyzer.class);
} catch (NoSuchMethodException ex) {
throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName
+ " does not have the required constructor with parameter types "
+ "[String.class, Analyzer.class]"
);
}

return (Constructor<QueryParserBase>) queryParserClass.getConstructor(String.class, Analyzer.class);
}
}
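A short sketch of how the new helper behaves, driven with one valid and one invalid parser class; the driver class itself is illustrative:

import java.lang.reflect.Constructor;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.pinot.segment.local.segment.store.TextIndexUtils;

public class QueryParserResolutionSketch {
  public static void main(String[] args) throws Exception {
    // Happy path: the classic parser extends QueryParserBase and exposes the
    // (String, Analyzer) constructor, so a usable Constructor comes back.
    Constructor<QueryParserBase> ctor = TextIndexUtils
        .getQueryParserWithStringAndAnalyzerTypeConstructor(
            "org.apache.lucene.queryparser.classic.QueryParser");
    QueryParserBase parser = ctor.newInstance("text_col", new StandardAnalyzer());
    System.out.println(parser.parse("apache pinot"));

    // Fail-fast path: java.lang.String is not a QueryParserBase, so the helper
    // throws before any query parsing is attempted.
    try {
      TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor("java.lang.String");
    } catch (ReflectiveOperationException e) {
      System.out.println("rejected: " + e.getMessage());
    }
  }
}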
RealtimeSegmentConverterTest.java
@@ -53,7 +53,6 @@
import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl;
import org.apache.pinot.segment.spi.index.reader.TextIndexReader;
import org.apache.pinot.segment.spi.store.SegmentDirectory;
import org.apache.pinot.spi.config.table.FSTType;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.config.table.IndexConfig;
import org.apache.pinot.spi.config.table.IndexingConfig;
@@ -86,7 +85,6 @@ public class RealtimeSegmentConverterTest {
private static final String LONG_COLUMN4 = "long_col4";
private static final String MV_INT_COLUMN = "mv_col";
private static final String DATE_TIME_COLUMN = "date_time_col";
private static final FSTType NULL_FST_TYPE = null;

private static final File TMP_DIR =
new File(FileUtils.getTempDirectory(), RealtimeSegmentConverterTest.class.getName());
@@ -475,7 +473,7 @@ public void testSegmentBuilderWithReuse(boolean columnMajorSegmentBuilder, Strin
String tableNameWithType = tableConfig.getTableName();
String segmentName = "testTable__0__0__123456";
IndexingConfig indexingConfig = tableConfig.getIndexingConfig();
TextIndexConfig textIndexConfig = new TextIndexConfigBuilder(NULL_FST_TYPE).build();
TextIndexConfig textIndexConfig = new TextIndexConfigBuilder().withUseANDForMultiTermQueries(false).build();

RealtimeSegmentConfig.Builder realtimeSegmentConfigBuilder =
new RealtimeSegmentConfig.Builder().setTableNameWithType(tableNameWithType).setSegmentName(segmentName)
RealtimeLuceneTextIndexTest.java
@@ -23,6 +23,7 @@
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.search.SearcherManager;
import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
@@ -71,8 +72,8 @@ private String[][] getMVTextData() {
@BeforeClass
public void setUp()
throws Exception {
TextIndexConfig config =
new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, null, null, null, false);
TextIndexConfig config = new TextIndexConfigBuilder().withUseANDForMultiTermQueries(false).build();

_realtimeLuceneTextIndex =
new RealtimeLuceneTextIndex(TEXT_COLUMN_NAME, INDEX_DIR, "fooBar", config);
_nativeMutableTextIndex = new NativeMutableTextIndex(TEXT_COLUMN_NAME);
TextIndexConfig.java
@@ -74,8 +74,8 @@ public TextIndexConfig(@JsonProperty("disabled") Boolean disabled, @JsonProperty
_rawValueForTextIndex = rawValueForTextIndex;
_enableQueryCache = enableQueryCache;
_useANDForMultiTermQueries = useANDForMultiTermQueries;
_stopWordsInclude = stopWordsInclude;
_stopWordsExclude = stopWordsExclude;
_stopWordsInclude = null == stopWordsInclude ? Collections.emptyList() : stopWordsInclude;
_stopWordsExclude = null == stopWordsExclude ? Collections.emptyList() : stopWordsExclude;
_luceneUseCompoundFile =
luceneUseCompoundFile == null ? LUCENE_INDEX_DEFAULT_USE_COMPOUND_FILE : luceneUseCompoundFile;
_luceneMaxBufferSizeMB =
@@ -219,7 +219,7 @@ public TextIndexConfig build() {
return new TextIndexConfig(false, _fstType, _rawValueForTextIndex, _enableQueryCache, _useANDForMultiTermQueries,
_stopWordsInclude, _stopWordsExclude, _luceneUseCompoundFile, _luceneMaxBufferSizeMB, _luceneAnalyzerClass,
CsvParser.serialize(_luceneAnalyzerClassArgs, true, false),
CsvParser.serialize(_luceneAnalyzerClassArgTypes, false, false),
CsvParser.serialize(_luceneAnalyzerClassArgTypes, true, false),
_luceneQueryParserClass, _enablePrefixSuffixMatchingInPhraseQueries);
}

@@ -230,6 +230,11 @@ public AbstractBuilder withRawValueForTextIndex(@Nullable Object rawValueForText
return this;
}

public AbstractBuilder withUseANDForMultiTermQueries(boolean useANDForMultiTermQueries) {
_useANDForMultiTermQueries = useANDForMultiTermQueries;
return this;
}

public AbstractBuilder withStopWordsInclude(List<String> stopWordsInclude) {
_stopWordsInclude = stopWordsInclude;
return this;
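With the null-coalescing above, unset stop-word lists now surface as empty lists rather than null. A small sketch of the expected behavior; the getter names are assumed from the field names in this class:

import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.index.TextIndexConfig;

public class StopWordsDefaultSketch {
  public static void main(String[] args) {
    TextIndexConfig config = new TextIndexConfigBuilder().build();
    // Previously these could be null when no stop words were configured;
    // callers can now iterate without a null check.
    System.out.println(config.getStopWordsInclude().isEmpty());  // true
    System.out.println(config.getStopWordsExclude().isEmpty());  // true
  }
}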
CsvParser.java
@@ -27,55 +27,57 @@
import javax.annotation.Nullable;

public class CsvParser {
private CsvParser() {
// Hide utility class default constructor
private CsvParser() {
// Hide utility class default constructor
}

/**
* Parse the input csv string with customizable parsing behavior. Sometimes the individual values may contain
* commas and other whitespace characters that are expected to be part of the actual argument.
*
* @param input string to split on comma
* @param escapeComma if true, we don't split on escaped commas, and we replace "\," with "," after the split
* @param trim whether we should trim each tokenized term
* @return a list of values, empty list if input is empty or null
*/
public static List<String> parse(@Nullable String input, boolean escapeComma, boolean trim) {
if (null == input || input.isEmpty()) {
return Collections.emptyList();
}

/**
* Parse the input csv string with customizable parsing behavior. Sometimes the individual values may contain
* commas and other whitespace characters that are expected to be part of the actual argument.
*
* @param input string to split on comma
* @param escapeComma if true, we do not split on escaped commas, and we replace "\," with "," after the split
* @param trim whether we should trim each tokenized term
* @return a list of values, empty list if input is empty or null
*/
public static List<String> parse(@Nullable String input, boolean escapeComma, boolean trim) {
if (null == input || input.isEmpty()) {
return Collections.emptyList();
}
Stream<String> tokenStream;
if (escapeComma) {
// Use regular expression to split on "," unless it is "\,"
// Use a negative limit so the pattern is applied as many times as possible and trailing empty
// strings are not discarded
tokenStream = Arrays.stream(input.split("(?<!\\\\),", -1))
.map(s -> s.replace("\\,", ","));
} else {
tokenStream = Arrays.stream(input.split(","));
}

Stream<String> tokenStream;
if (escapeComma) {
// Use regular expression to split on "," unless it is "\,"
tokenStream = Arrays.stream(input.split("(?<!\\\\),"))
.map(s -> s.replace("\\,", ","));
} else {
tokenStream = Arrays.stream(input.split(","));
}
if (trim) {
tokenStream = tokenStream.map(String::trim);
}

if (trim) {
tokenStream = tokenStream.map(String::trim);
}
return tokenStream.collect(Collectors.toList());
}

return tokenStream.collect(Collectors.toList());
/**
* Serialize the input list of strings with customizable serialization behavior.
* @param input list of strings to be serialized
* @param escapeComma if true, escape commas by replacing "," with "\," before the join
* @param trim whether we should trim each token before serialization
* @return serialized string representing the input list of strings
*/
public static String serialize(List<String> input, boolean escapeComma, boolean trim) {
Stream<String> tokenStream = input.stream();
if (escapeComma) {
tokenStream = tokenStream.map(s -> s.replaceAll(",", Matcher.quoteReplacement("\\,")));
}

/**
* Serialize the input list of strings with customizable serialization behavior.
* @param input list of strings to be serialized
* @param escapeComma if true, escape commas by replacing "," with "\," before the join
* @param trim whether we should trim each token before serialization
* @return serialized string representing the input list of strings
*/
public static String serialize(List<String> input, boolean escapeComma, boolean trim) {
Stream<String> tokenStream = input.stream();
if (escapeComma) {
tokenStream = tokenStream.map(s -> s.replaceAll(",", Matcher.quoteReplacement("\\,")));
}
if (trim) {
tokenStream = tokenStream.map(String::trim);
}
return tokenStream.collect(Collectors.joining(","));
if (trim) {
tokenStream = tokenStream.map(String::trim);
}
return tokenStream.collect(Collectors.joining(","));
}
}
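A brief usage sketch of the escaping behavior and the negative split limit discussed above; the import path for CsvParser is assumed, and expected outputs are shown in comments:

import java.util.Arrays;
import java.util.List;
import org.apache.pinot.spi.utils.CsvParser;

public class CsvParserSketch {
  public static void main(String[] args) {
    // An escaped comma stays inside its token; the bare comma still splits.
    List<String> parsed = CsvParser.parse("a\\,b, c", true, true);
    System.out.println(parsed);  // [a,b, c] -- two tokens: "a,b" and "c"

    // The -1 split limit preserves trailing empty tokens.
    System.out.println(CsvParser.parse("a,,", true, false));  // [a, , ]

    // serialize() re-escapes embedded commas, so parse() round-trips it.
    String joined = CsvParser.serialize(Arrays.asList("a,b", "c"), true, false);
    System.out.println(joined);  // a\,b,c
    System.out.println(CsvParser.parse(joined, true, false));  // [a,b, c]
  }
}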
