
Commit

Merge pull request #178 from chris-bamford/luke-4.10.4-field-reconstruction-2

Issue #175: Offered improvement to reconstruction of unstored fields …
DmitryKey authored Dec 16, 2019
2 parents 13f7243 + d377c7f commit 263ab99
Showing 3 changed files with 151 additions and 40 deletions.
3 changes: 2 additions & 1 deletion pom.xml
@@ -6,7 +6,7 @@

<groupId>luke</groupId>
<artifactId>luke</artifactId>
<version>${lucene.version}</version>
<version>4.10.4</version>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -20,6 +20,7 @@
<maven-shade-plugin.version>2.3</maven-shade-plugin.version>
<maven-compiler-plugin.version>3.1</maven-compiler-plugin.version>
<maven-dependency-plugin.version>2.8</maven-dependency-plugin.version>
<maven-assembly-plugin.version>2.2</maven-assembly-plugin.version>
<maven-jar-plugin.version>2.4</maven-jar-plugin.version>
<maven-source-plugin.version>2.2.1</maven-source-plugin.version>
<exec-maven-plugin.version>1.2.1</exec-maven-plugin.version>
86 changes: 47 additions & 39 deletions src/main/java/org/getopt/luke/DocReconstructor.java
@@ -2,6 +2,7 @@

import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

@@ -14,7 +15,7 @@
* index, and these terms may have been changed (e.g. lowercased, stemmed),
* and many other input tokens may have been skipped altogether by the
* Analyzer, when fields were originally added to the index.
*
*
* @author ab
*
*/
@@ -24,7 +25,7 @@ public class DocReconstructor extends Observable {
private AtomicReader reader = null;
private int numTerms;
private Bits live;

/**
* Prepare a document reconstructor.
* @param reader IndexReader to read from.
@@ -33,7 +34,7 @@ public class DocReconstructor extends Observable {
public DocReconstructor(IndexReader reader) throws Exception {
this(reader, null, -1);
}

/**
* Prepare a document reconstructor.
* @param reader IndexReader to read from.
@@ -65,7 +66,7 @@ public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) t
numTerms = 0;
Iterator<String> fe = fields.iterator();
while (fe.hasNext()) {
String fld = fe.next();
String fld = fe.next();
Terms t = fields.terms(fld);
TermsEnum te = t.iterator(null);
while (te.next() != null) {
@@ -76,7 +77,7 @@ public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) t
}
live = MultiFields.getLiveDocs(reader);
}

/**
* Reconstruct document fields.
* @param docNum document number. If this document is deleted, but the index
@@ -155,38 +156,45 @@ public Reconstructed reconstruct(int docNum) throws Exception {

DocsAndPositionsEnum newDpe = te.docsAndPositions(live, dpe, 0);

if (newDpe == null) { // no position info for this field
// re-construct without positions
GrowableStringArray gsa = (GrowableStringArray)
res.getReconstructedFields().get(fld);
if (gsa == null) {
gsa = new GrowableStringArray();
res.getReconstructedFields().put(fld, gsa);
}
gsa.append(0, "|", docTerm);
// we are done. Move to the next field
break;
}

// we should have positions as well for the field, process them accordingly
dpe = newDpe;
if (newDpe != null) {
// we have positions for the field, process them accordingly
dpe = newDpe;

int num = dpe.advance(docNum);
if (num != docNum) { // either greater than or NO_MORE_DOCS
continue; // no data for this term in this doc
}
int num = dpe.advance(docNum);
if (num != docNum) { // either greater than or NO_MORE_DOCS
continue; // no data for this term in this doc
}

// we have computed the value earlier, using the bytesRef data structure
docTerm = te.term().utf8ToString();
// we have computed the value earlier, using the bytesRef data structure
docTerm = te.term().utf8ToString();

GrowableStringArray gsa = res.getReconstructedFields().get(fld);
if (gsa == null) {
gsa = new GrowableStringArray();
res.getReconstructedFields().put(fld, gsa);
}
for (int k = 0; k < dpe.freq(); k++) {
int pos = dpe.nextPosition();
gsa.append(pos, "|", docTerm);
GrowableStringArray gsa = res.getReconstructedFields().get(fld);
if (gsa == null) {
gsa = new GrowableStringArray();
res.getReconstructedFields().put(fld, gsa);
}
for (int k = 0; k < dpe.freq(); k++) {
int pos = dpe.nextPosition();
gsa.append(pos, "|", docTerm);
}
} else {
// Reconstruct without positions (cross-reference via DocsEnum).
// NB if there are multiple terms they will all be added to the array at position 0
// (concatenated together, pipe-delimited)
DocsEnum docsEnum = te.docs(null, null);
if (docsEnum != null) {
int termDoc;
while ((termDoc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (termDoc == docNum) {
GrowableStringArray gsa = res.getReconstructedFields().get(fld);
if (gsa == null) {
gsa = new GrowableStringArray();
res.getReconstructedFields().put(fld, gsa);
}
gsa.append(0, "|", docTerm);
}
}
}
}
}
}
@@ -196,7 +204,7 @@ public Reconstructed reconstruct(int docNum) throws Exception {
notifyObservers(progress);
return res;
}

/**
* This class represents a reconstructed document.
* @author ab
@@ -209,18 +217,18 @@ public Reconstructed() {
storedFields = new HashMap<String, IndexableField[]>();
reconstructedFields = new HashMap<String, GrowableStringArray>();
}

/**
* Construct an instance of this class using existing field data.
* @param storedFields field data of stored fields
* @param reconstructedFields field data of unstored fields
*/
public Reconstructed(Map<String, IndexableField[]> storedFields,
Map<String, GrowableStringArray> reconstructedFields) {
Map<String, GrowableStringArray> reconstructedFields) {
this.storedFields = storedFields;
this.reconstructedFields = reconstructedFields;
}

/**
* Get an alphabetically sorted list of field names.
*/
@@ -233,7 +241,7 @@ public List<String> getFieldNames() {
Collections.sort(res);
return res;
}

public boolean hasField(String name) {
return storedFields.containsKey(name) || reconstructedFields.containsKey(name);
}
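For readers skimming the diff: the change above replaces the old early exit (when docsAndPositions() returned null, append the term once at position 0 and break out of the field) with an if/else that keeps the positional reconstruction when positions exist and otherwise cross-references the term's plain postings via DocsEnum. Below is a minimal, self-contained sketch of that strategy, not part of this commit; the class and method names (FieldReconstructionSketch, collectTerms) are illustrative, and only the Lucene 4.10 API calls mirror the patched code. The fallback here uses advance() where the patch loops with nextDoc(); for a single target document the effect is the same.

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;

import java.util.ArrayList;
import java.util.List;

/** Illustrative only: collect the indexed terms of one field for one document. */
class FieldReconstructionSketch {

    static List<String> collectTerms(AtomicReader reader, String field, int docNum) throws Exception {
        List<String> result = new ArrayList<String>();
        Terms terms = reader.terms(field);
        if (terms == null) {
            return result; // field is not indexed
        }
        Bits live = reader.getLiveDocs();
        TermsEnum te = terms.iterator(null);
        while (te.next() != null) {
            String text = te.term().utf8ToString();
            DocsAndPositionsEnum dpe = te.docsAndPositions(live, null, 0);
            if (dpe != null) {
                // Positional postings exist: record the term at each recorded position.
                if (dpe.advance(docNum) == docNum) {
                    for (int i = 0; i < dpe.freq(); i++) {
                        result.add(dpe.nextPosition() + ":" + text);
                    }
                }
            } else {
                // No positions (e.g. DOCS_ONLY fields): cross-reference the plain postings
                // and record the term at position 0, as the patched DocReconstructor now does.
                DocsEnum de = te.docs(live, null);
                if (de != null && de.advance(docNum) == docNum) {
                    result.add("0:" + text);
                }
            }
        }
        return result;
    }
}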
102 changes: 102 additions & 0 deletions src/test/java/org.apache.lucene.index/IndexTester2.java
@@ -0,0 +1,102 @@
package org.apache.lucene.index;

import junit.framework.TestCase;
import org.apache.lucene.analysis.standard.UAX29URLEmailAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;
import org.getopt.luke.DocReconstructor;
import org.getopt.luke.IndexInfo;

import java.io.File;

/**
* Created by cbamford on 19/11/2019.
* Tests that unstored fields with no position info are reconstructed correctly.
* For completeness it also checks the 3 other field types.
*/
public class IndexTester2 extends TestCase {

private String indexPath = "src/test/indices/lukeindex2";
private IndexWriterConfig indexCfg;
private Directory directory;
private DocReconstructor recon;

@Override
protected void setUp() throws Exception {
super.setUp();
directory = NIOFSDirectory.open(new File(indexPath));
populate();
}

@Override
protected void tearDown() throws Exception {
super.tearDown();
if (directory != null) directory.close();
}

public void testDummy() {
assertTrue(true == true);
}

public void testVerifyReconstructionOfMultipleFieldTypesAcrossMultipleDocs() throws Exception {

// Check doc 1
DocReconstructor.Reconstructed reconstructed = recon.reconstruct(0);
assertEquals("value1", (reconstructed.getStoredFields().get("stored"))[0].stringValue());
assertEquals("value1", reconstructed.getReconstructedFields().get("stored+tvs").get(0));
assertEquals("value1", reconstructed.getReconstructedFields().get("unstored-posns").get(0));
assertEquals("value1", reconstructed.getReconstructedFields().get("unstored+posns").get(0));

// Check doc 2
reconstructed = recon.reconstruct(1);
assertEquals("value2", (reconstructed.getStoredFields().get("stored"))[0].stringValue());
assertEquals("value2", reconstructed.getReconstructedFields().get("stored+tvs").get(0));
assertEquals("value2", reconstructed.getReconstructedFields().get("unstored-posns").get(0));
assertEquals("value2", reconstructed.getReconstructedFields().get("unstored+posns").get(0));
}

private void populate() throws Exception {
// create an index
indexCfg = new IndexWriterConfig(Version.LUCENE_4_10_3, new UAX29URLEmailAnalyzer());
indexCfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

IndexWriter writer = new IndexWriter(directory, indexCfg);
FieldType tvFtype = createUnstoredWithTermVectorsFieldType();

Document doc = new Document();
doc.add(new TextField("stored", "value1", Field.Store.YES));
doc.add(new Field("stored+tvs", "value1", tvFtype));
doc.add(new TextField("unstored+posns", "value1", Field.Store.NO));
doc.add(new StringField("unstored-posns", "value1", Field.Store.NO));
writer.addDocument(doc);

doc = new Document();
doc.add(new TextField("stored", "value2", Field.Store.YES));
doc.add(new Field("stored+tvs", "value2", tvFtype));
doc.add(new TextField("unstored+posns", "value2", Field.Store.NO));
doc.add(new StringField("unstored-posns", "value2", Field.Store.NO));
writer.addDocument(doc);

writer.close();

IndexReader ir = DirectoryReader.open(directory);
IndexInfo idxInfo = new IndexInfo(ir, indexPath);
String[] idxFields = idxInfo.getFieldNames().toArray(new String[0]);

recon = new DocReconstructor(ir, idxFields, idxInfo.getNumTerms());
}

private FieldType createUnstoredWithTermVectorsFieldType() {
FieldType fType = new FieldType();
fType.setStored(false);
fType.setIndexed(true);
fType.setTokenized(true);
fType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
fType.setStoreTermVectors(true);
fType.setStoreTermVectorOffsets(true);
fType.setStoreTermVectorPositions(true);
return fType;
}
}
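A note on the field names in the test above: "unstored-posns" is a StringField with Field.Store.NO, which is indexed with FieldInfo.IndexOptions.DOCS_ONLY and therefore carries no position data, so it exercises the new DocsEnum fallback path; "unstored+posns" is a TextField, whose postings include frequencies and positions and so still takes the original positional path. A rough sketch of the two effective field types follows (illustrative only, not part of the commit; the class name TestFieldTypesSketch is made up):

import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.FieldInfo;

/** Illustrative only: the effective index options behind the two unstored test fields. */
class TestFieldTypesSketch {

    /** Roughly what StringField(..., Field.Store.NO) indexes as: docs only, no positions,
     *  which forces DocReconstructor onto the new DocsEnum fallback. */
    static FieldType docsOnly() {
        FieldType t = new FieldType();
        t.setIndexed(true);
        t.setTokenized(false);
        t.setOmitNorms(true);
        t.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY);
        t.freeze();
        return t;
    }

    /** Roughly what TextField(..., Field.Store.NO) indexes as: docs, freqs and positions,
     *  so the existing positional reconstruction path still applies. */
    static FieldType withPositions() {
        FieldType t = new FieldType();
        t.setIndexed(true);
        t.setTokenized(true);
        t.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        t.freeze();
        return t;
    }
}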
