From d377c7f7de17d1aa74ba74f33794e56ca7c5ed10 Mon Sep 17 00:00:00 2001 From: Chris Bamford Date: Mon, 16 Dec 2019 15:42:22 +0000 Subject: [PATCH] Issue #175: Offered improvement to reconstruction of unstored fields with no position information --- pom.xml | 3 +- .../org/getopt/luke/DocReconstructor.java | 86 ++++++++------- .../org.apache.lucene.index/IndexTester2.java | 102 ++++++++++++++++++ 3 files changed, 151 insertions(+), 40 deletions(-) create mode 100644 src/test/java/org.apache.lucene.index/IndexTester2.java diff --git a/pom.xml b/pom.xml index 1d2408d..d171acd 100755 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ luke luke - ${lucene.version} + 4.10.4 UTF-8 @@ -20,6 +20,7 @@ 2.3 3.1 2.8 + 2.2 2.4 2.2.1 1.2.1 diff --git a/src/main/java/org/getopt/luke/DocReconstructor.java b/src/main/java/org/getopt/luke/DocReconstructor.java index deef739..600e706 100755 --- a/src/main/java/org/getopt/luke/DocReconstructor.java +++ b/src/main/java/org/getopt/luke/DocReconstructor.java @@ -2,6 +2,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.index.*; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -14,7 +15,7 @@ * index, and these terms may have been changed (e.g. lowercased, stemmed), * and many other input tokens may have been skipped altogether by the * Analyzer, when fields were originally added to the index. - * + * * @author ab * */ @@ -24,7 +25,7 @@ public class DocReconstructor extends Observable { private AtomicReader reader = null; private int numTerms; private Bits live; - + /** * Prepare a document reconstructor. * @param reader IndexReader to read from. @@ -33,7 +34,7 @@ public class DocReconstructor extends Observable { public DocReconstructor(IndexReader reader) throws Exception { this(reader, null, -1); } - + /** * Prepare a document reconstructor. * @param reader IndexReader to read from. @@ -65,7 +66,7 @@ public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) t numTerms = 0; Iterator fe = fields.iterator(); while (fe.hasNext()) { - String fld = fe.next(); + String fld = fe.next(); Terms t = fields.terms(fld); TermsEnum te = t.iterator(null); while (te.next() != null) { @@ -76,7 +77,7 @@ public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) t } live = MultiFields.getLiveDocs(reader); } - + /** * Reconstruct document fields. * @param docNum document number. If this document is deleted, but the index @@ -155,38 +156,45 @@ public Reconstructed reconstruct(int docNum) throws Exception { DocsAndPositionsEnum newDpe = te.docsAndPositions(live, dpe, 0); - if (newDpe == null) { // no position info for this field - // re-construct without positions - GrowableStringArray gsa = (GrowableStringArray) - res.getReconstructedFields().get(fld); - if (gsa == null) { - gsa = new GrowableStringArray(); - res.getReconstructedFields().put(fld, gsa); - } - gsa.append(0, "|", docTerm); - // we are done. Move to the next field - break; - } - - // we should have positions as well for the field, process them accordingly - dpe = newDpe; + if (newDpe != null) { + // we have positions for the field, process them accordingly + dpe = newDpe; - int num = dpe.advance(docNum); - if (num != docNum) { // either greater than or NO_MORE_DOCS - continue; // no data for this term in this doc - } + int num = dpe.advance(docNum); + if (num != docNum) { // either greater than or NO_MORE_DOCS + continue; // no data for this term in this doc + } - // we have computed the value earlier, using the bytesRef data structure - docTerm = te.term().utf8ToString(); + // we have computed the value earlier, using the bytesRef data structure + docTerm = te.term().utf8ToString(); - GrowableStringArray gsa = res.getReconstructedFields().get(fld); - if (gsa == null) { - gsa = new GrowableStringArray(); - res.getReconstructedFields().put(fld, gsa); - } - for (int k = 0; k < dpe.freq(); k++) { - int pos = dpe.nextPosition(); - gsa.append(pos, "|", docTerm); + GrowableStringArray gsa = res.getReconstructedFields().get(fld); + if (gsa == null) { + gsa = new GrowableStringArray(); + res.getReconstructedFields().put(fld, gsa); + } + for (int k = 0; k < dpe.freq(); k++) { + int pos = dpe.nextPosition(); + gsa.append(pos, "|", docTerm); + } + } else { + // Reconstruct without positions (cross-reference via DocsEnum). + // NB if there are multiple terms they will all be added to the array at position 0 + // (concatenated together, pipe-delimited) + DocsEnum docsEnum = te.docs(null, null); + if (docsEnum != null) { + int termDoc; + while ((termDoc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + if (termDoc == docNum) { + GrowableStringArray gsa = res.getReconstructedFields().get(fld); + if (gsa == null) { + gsa = new GrowableStringArray(); + res.getReconstructedFields().put(fld, gsa); + } + gsa.append(0, "|", docTerm); + } + } + } } } } @@ -196,7 +204,7 @@ public Reconstructed reconstruct(int docNum) throws Exception { notifyObservers(progress); return res; } - + /** * This class represents a reconstructed document. * @author ab @@ -209,18 +217,18 @@ public Reconstructed() { storedFields = new HashMap(); reconstructedFields = new HashMap(); } - + /** * Construct an instance of this class using existing field data. * @param storedFields field data of stored fields * @param reconstructedFields field data of unstored fields */ public Reconstructed(Map storedFields, - Map reconstructedFields) { + Map reconstructedFields) { this.storedFields = storedFields; this.reconstructedFields = reconstructedFields; } - + /** * Get an alphabetically sorted list of field names. */ @@ -233,7 +241,7 @@ public List getFieldNames() { Collections.sort(res); return res; } - + public boolean hasField(String name) { return storedFields.containsKey(name) || reconstructedFields.containsKey(name); } diff --git a/src/test/java/org.apache.lucene.index/IndexTester2.java b/src/test/java/org.apache.lucene.index/IndexTester2.java new file mode 100644 index 0000000..bcfbe3d --- /dev/null +++ b/src/test/java/org.apache.lucene.index/IndexTester2.java @@ -0,0 +1,102 @@ +package org.apache.lucene.index; + +import junit.framework.TestCase; +import org.apache.lucene.analysis.standard.UAX29URLEmailAnalyzer; +import org.apache.lucene.document.*; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.NIOFSDirectory; +import org.apache.lucene.util.Version; +import org.getopt.luke.DocReconstructor; +import org.getopt.luke.IndexInfo; + +import java.io.File; + +/** + * Created by cbamford on 19/11/2019. + * Tests that unstored fields with no position info are reconstructed correctly. + * For completeness it also checks the 3 other field types. + */ +public class IndexTester2 extends TestCase { + + private String indexPath = "src/test/indices/lukeindex2"; + private IndexWriterConfig indexCfg; + private Directory directory; + private DocReconstructor recon; + + @Override + protected void setUp() throws Exception { + super.setUp(); + directory = NIOFSDirectory.open(new File(indexPath)); + populate(); + } + + @Override + protected void tearDown() throws Exception { + super.tearDown(); + if (directory != null) directory.close(); + } + + public void testDummy() { + assertTrue(true == true); + } + + public void testVerifyReconstructionOfMultipleFieldTypesAcrossMultipleDocs() throws Exception { + + // Check doc 1 + DocReconstructor.Reconstructed reconstructed = recon.reconstruct(0); + assertEquals("value1", (reconstructed.getStoredFields().get("stored"))[0].stringValue()); + assertEquals("value1", reconstructed.getReconstructedFields().get("stored+tvs").get(0)); + assertEquals("value1", reconstructed.getReconstructedFields().get("unstored-posns").get(0)); + assertEquals("value1", reconstructed.getReconstructedFields().get("unstored+posns").get(0)); + + // Check doc 2 + reconstructed = recon.reconstruct(1); + assertEquals("value2", (reconstructed.getStoredFields().get("stored"))[0].stringValue()); + assertEquals("value2", reconstructed.getReconstructedFields().get("stored+tvs").get(0)); + assertEquals("value2", reconstructed.getReconstructedFields().get("unstored-posns").get(0)); + assertEquals("value2", reconstructed.getReconstructedFields().get("unstored+posns").get(0)); + } + + private void populate() throws Exception { + // create an index + indexCfg = new IndexWriterConfig(Version.LUCENE_4_10_3, new UAX29URLEmailAnalyzer()); + indexCfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE); + + IndexWriter writer = new IndexWriter(directory, indexCfg); + FieldType tvFtype = createUnstoredWithTermVectorsFieldType(); + + Document doc = new Document(); + doc.add(new TextField("stored", "value1", Field.Store.YES)); + doc.add(new Field("stored+tvs", "value1", tvFtype)); + doc.add(new TextField("unstored+posns", "value1", Field.Store.NO)); + doc.add(new StringField("unstored-posns", "value1", Field.Store.NO)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new TextField("stored", "value2", Field.Store.YES)); + doc.add(new Field("stored+tvs", "value2", tvFtype)); + doc.add(new TextField("unstored+posns", "value2", Field.Store.NO)); + doc.add(new StringField("unstored-posns", "value2", Field.Store.NO)); + writer.addDocument(doc); + + writer.close(); + + IndexReader ir = DirectoryReader.open(directory); + IndexInfo idxInfo = new IndexInfo(ir, indexPath); + String[] idxFields = idxInfo.getFieldNames().toArray(new String[0]); + + recon = new DocReconstructor(ir, idxFields, idxInfo.getNumTerms()); + } + + private FieldType createUnstoredWithTermVectorsFieldType() { + FieldType fType = new FieldType(); + fType.setStored(false); + fType.setIndexed(true); + fType.setTokenized(true); + fType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + fType.setStoreTermVectors(true); + fType.setStoreTermVectorOffsets(true); + fType.setStoreTermVectorPositions(true); + return fType; + } +}