
Commit

Merge pull request #178 from chris-bamford/luke-4.10.4-field-reconstruction-2

Issue #175: Offered improvement to reconstruction of unstored fields …
DmitryKey authored Dec 16, 2019
2 parents 13f7243 + d377c7f commit 263ab99
Showing 3 changed files with 151 additions and 40 deletions.
3 changes: 2 additions & 1 deletion pom.xml
@@ -6,7 +6,7 @@

<groupId>luke</groupId>
<artifactId>luke</artifactId>
<version>${lucene.version}</version>
<version>4.10.4</version>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -20,6 +20,7 @@
<maven-shade-plugin.version>2.3</maven-shade-plugin.version>
<maven-compiler-plugin.version>3.1</maven-compiler-plugin.version>
<maven-dependency-plugin.version>2.8</maven-dependency-plugin.version>
<maven-assembly-plugin.version>2.2</maven-assembly-plugin.version>
<maven-jar-plugin.version>2.4</maven-jar-plugin.version>
<maven-source-plugin.version>2.2.1</maven-source-plugin.version>
<exec-maven-plugin.version>1.2.1</exec-maven-plugin.version>
86 changes: 47 additions & 39 deletions src/main/java/org/getopt/luke/DocReconstructor.java
@@ -2,6 +2,7 @@

import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

@@ -14,7 +15,7 @@
* index, and these terms may have been changed (e.g. lowercased, stemmed),
* and many other input tokens may have been skipped altogether by the
* Analyzer, when fields were originally added to the index.
*
*
* @author ab
*
*/
@@ -24,7 +25,7 @@ public class DocReconstructor extends Observable {
private AtomicReader reader = null;
private int numTerms;
private Bits live;

/**
* Prepare a document reconstructor.
* @param reader IndexReader to read from.
@@ -33,7 +34,7 @@ public class DocReconstructor extends Observable {
public DocReconstructor(IndexReader reader) throws Exception {
this(reader, null, -1);
}

/**
* Prepare a document reconstructor.
* @param reader IndexReader to read from.
@@ -65,7 +66,7 @@ public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) t
numTerms = 0;
Iterator<String> fe = fields.iterator();
while (fe.hasNext()) {
String fld = fe.next();
String fld = fe.next();
Terms t = fields.terms(fld);
TermsEnum te = t.iterator(null);
while (te.next() != null) {
@@ -76,7 +77,7 @@ public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) t
}
live = MultiFields.getLiveDocs(reader);
}

/**
* Reconstruct document fields.
* @param docNum document number. If this document is deleted, but the index
@@ -155,38 +156,45 @@ public Reconstructed reconstruct(int docNum) throws Exception {

DocsAndPositionsEnum newDpe = te.docsAndPositions(live, dpe, 0);

if (newDpe == null) { // no position info for this field
// re-construct without positions
GrowableStringArray gsa = (GrowableStringArray)
res.getReconstructedFields().get(fld);
if (gsa == null) {
gsa = new GrowableStringArray();
res.getReconstructedFields().put(fld, gsa);
}
gsa.append(0, "|", docTerm);
// we are done. Move to the next field
break;
}

// we should have positions as well for the field, process them accordingly
dpe = newDpe;
if (newDpe != null) {
// we have positions for the field, process them accordingly
dpe = newDpe;

int num = dpe.advance(docNum);
if (num != docNum) { // either greater than or NO_MORE_DOCS
continue; // no data for this term in this doc
}
int num = dpe.advance(docNum);
if (num != docNum) { // either greater than or NO_MORE_DOCS
continue; // no data for this term in this doc
}

// we have computed the value earlier, using the bytesRef data structure
docTerm = te.term().utf8ToString();
// we have computed the value earlier, using the bytesRef data structure
docTerm = te.term().utf8ToString();

GrowableStringArray gsa = res.getReconstructedFields().get(fld);
if (gsa == null) {
gsa = new GrowableStringArray();
res.getReconstructedFields().put(fld, gsa);
}
for (int k = 0; k < dpe.freq(); k++) {
int pos = dpe.nextPosition();
gsa.append(pos, "|", docTerm);
GrowableStringArray gsa = res.getReconstructedFields().get(fld);
if (gsa == null) {
gsa = new GrowableStringArray();
res.getReconstructedFields().put(fld, gsa);
}
for (int k = 0; k < dpe.freq(); k++) {
int pos = dpe.nextPosition();
gsa.append(pos, "|", docTerm);
}
} else {
// Reconstruct without positions (cross-reference via DocsEnum).
// NB if there are multiple terms they will all be added to the array at position 0
// (concatenated together, pipe-delimited)
DocsEnum docsEnum = te.docs(null, null);
if (docsEnum != null) {
int termDoc;
while ((termDoc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (termDoc == docNum) {
GrowableStringArray gsa = res.getReconstructedFields().get(fld);
if (gsa == null) {
gsa = new GrowableStringArray();
res.getReconstructedFields().put(fld, gsa);
}
gsa.append(0, "|", docTerm);
}
}
}
}
}
}
@@ -196,7 +204,7 @@ public Reconstructed reconstruct(int docNum) throws Exception {
notifyObservers(progress);
return res;
}

/**
* This class represents a reconstructed document.
* @author ab
@@ -209,18 +217,18 @@ public Reconstructed() {
storedFields = new HashMap<String, IndexableField[]>();
reconstructedFields = new HashMap<String, GrowableStringArray>();
}

/**
* Construct an instance of this class using existing field data.
* @param storedFields field data of stored fields
* @param reconstructedFields field data of unstored fields
*/
public Reconstructed(Map<String, IndexableField[]> storedFields,
Map<String, GrowableStringArray> reconstructedFields) {
Map<String, GrowableStringArray> reconstructedFields) {
this.storedFields = storedFields;
this.reconstructedFields = reconstructedFields;
}

/**
* Get an alphabetically sorted list of field names.
*/
@@ -233,7 +241,7 @@ public List<String> getFieldNames() {
Collections.sort(res);
return res;
}

public boolean hasField(String name) {
return storedFields.containsKey(name) || reconstructedFields.containsKey(name);
}
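For readers skimming the diff: the change above replaces the old early exit (when docsAndPositions() returned null, append the term once at position 0 and break out of the field) with an if/else that keeps the positional reconstruction when positions exist and otherwise cross-references the term's plain postings via DocsEnum. Below is a minimal, self-contained sketch of that strategy, not part of this commit; the class and method names (FieldReconstructionSketch, collectTerms) are illustrative, and only the Lucene 4.10 API calls mirror the patched code. The fallback here uses advance() where the patch loops with nextDoc(); for a single target document the effect is the same.

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;

import java.util.ArrayList;
import java.util.List;

/** Illustrative only: collect the indexed terms of one field for one document. */
class FieldReconstructionSketch {

    static List<String> collectTerms(AtomicReader reader, String field, int docNum) throws Exception {
        List<String> result = new ArrayList<String>();
        Terms terms = reader.terms(field);
        if (terms == null) {
            return result; // field is not indexed
        }
        Bits live = reader.getLiveDocs();
        TermsEnum te = terms.iterator(null);
        while (te.next() != null) {
            String text = te.term().utf8ToString();
            DocsAndPositionsEnum dpe = te.docsAndPositions(live, null, 0);
            if (dpe != null) {
                // Positional postings exist: record the term at each recorded position.
                if (dpe.advance(docNum) == docNum) {
                    for (int i = 0; i < dpe.freq(); i++) {
                        result.add(dpe.nextPosition() + ":" + text);
                    }
                }
            } else {
                // No positions (e.g. DOCS_ONLY fields): cross-reference the plain postings
                // and record the term at position 0, as the patched DocReconstructor now does.
                DocsEnum de = te.docs(live, null);
                if (de != null && de.advance(docNum) == docNum) {
                    result.add("0:" + text);
                }
            }
        }
        return result;
    }
}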
102 changes: 102 additions & 0 deletions src/test/java/org.apache.lucene.index/IndexTester2.java
@@ -0,0 +1,102 @@
package org.apache.lucene.index;

import junit.framework.TestCase;
import org.apache.lucene.analysis.standard.UAX29URLEmailAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;
import org.getopt.luke.DocReconstructor;
import org.getopt.luke.IndexInfo;

import java.io.File;

/**
* Created by cbamford on 19/11/2019.
* Tests that unstored fields with no position info are reconstructed correctly.
* For completeness it also checks the 3 other field types.
*/
public class IndexTester2 extends TestCase {

private String indexPath = "src/test/indices/lukeindex2";
private IndexWriterConfig indexCfg;
private Directory directory;
private DocReconstructor recon;

@Override
protected void setUp() throws Exception {
super.setUp();
directory = NIOFSDirectory.open(new File(indexPath));
populate();
}

@Override
protected void tearDown() throws Exception {
super.tearDown();
if (directory != null) directory.close();
}

public void testDummy() {
assertTrue(true == true);
}

public void testVerifyReconstructionOfMultipleFieldTypesAcrossMultipleDocs() throws Exception {

// Check doc 1
DocReconstructor.Reconstructed reconstructed = recon.reconstruct(0);
assertEquals("value1", (reconstructed.getStoredFields().get("stored"))[0].stringValue());
assertEquals("value1", reconstructed.getReconstructedFields().get("stored+tvs").get(0));
assertEquals("value1", reconstructed.getReconstructedFields().get("unstored-posns").get(0));
assertEquals("value1", reconstructed.getReconstructedFields().get("unstored+posns").get(0));

// Check doc 2
reconstructed = recon.reconstruct(1);
assertEquals("value2", (reconstructed.getStoredFields().get("stored"))[0].stringValue());
assertEquals("value2", reconstructed.getReconstructedFields().get("stored+tvs").get(0));
assertEquals("value2", reconstructed.getReconstructedFields().get("unstored-posns").get(0));
assertEquals("value2", reconstructed.getReconstructedFields().get("unstored+posns").get(0));
}

private void populate() throws Exception {
// create an index
indexCfg = new IndexWriterConfig(Version.LUCENE_4_10_3, new UAX29URLEmailAnalyzer());
indexCfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

IndexWriter writer = new IndexWriter(directory, indexCfg);
FieldType tvFtype = createUnstoredWithTermVectorsFieldType();

Document doc = new Document();
doc.add(new TextField("stored", "value1", Field.Store.YES));
doc.add(new Field("stored+tvs", "value1", tvFtype));
doc.add(new TextField("unstored+posns", "value1", Field.Store.NO));
doc.add(new StringField("unstored-posns", "value1", Field.Store.NO));
writer.addDocument(doc);

doc = new Document();
doc.add(new TextField("stored", "value2", Field.Store.YES));
doc.add(new Field("stored+tvs", "value2", tvFtype));
doc.add(new TextField("unstored+posns", "value2", Field.Store.NO));
doc.add(new StringField("unstored-posns", "value2", Field.Store.NO));
writer.addDocument(doc);

writer.close();

IndexReader ir = DirectoryReader.open(directory);
IndexInfo idxInfo = new IndexInfo(ir, indexPath);
String[] idxFields = idxInfo.getFieldNames().toArray(new String[0]);

recon = new DocReconstructor(ir, idxFields, idxInfo.getNumTerms());
}

private FieldType createUnstoredWithTermVectorsFieldType() {
FieldType fType = new FieldType();
fType.setStored(false);
fType.setIndexed(true);
fType.setTokenized(true);
fType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
fType.setStoreTermVectors(true);
fType.setStoreTermVectorOffsets(true);
fType.setStoreTermVectorPositions(true);
return fType;
}
}
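A note on the field names in the test above: "unstored-posns" is a StringField with Field.Store.NO, which is indexed with FieldInfo.IndexOptions.DOCS_ONLY and therefore carries no position data, so it exercises the new DocsEnum fallback path; "unstored+posns" is a TextField, whose postings include frequencies and positions and so still takes the original positional path. A rough sketch of the two effective field types follows (illustrative only, not part of the commit; the class name TestFieldTypesSketch is made up):

import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.FieldInfo;

/** Illustrative only: the effective index options behind the two unstored test fields. */
class TestFieldTypesSketch {

    /** Roughly what StringField(..., Field.Store.NO) indexes as: docs only, no positions,
     *  which forces DocReconstructor onto the new DocsEnum fallback. */
    static FieldType docsOnly() {
        FieldType t = new FieldType();
        t.setIndexed(true);
        t.setTokenized(false);
        t.setOmitNorms(true);
        t.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY);
        t.freeze();
        return t;
    }

    /** Roughly what TextField(..., Field.Store.NO) indexes as: docs, freqs and positions,
     *  so the existing positional reconstruction path still applies. */
    static FieldType withPositions() {
        FieldType t = new FieldType();
        t.setIndexed(true);
        t.setTokenized(true);
        t.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        t.freeze();
        return t;
    }
}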
