diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java
index bdc6564759..c46b40064d 100755
--- a/src/main/java/io/anserini/index/IndexReaderUtils.java
+++ b/src/main/java/io/anserini/index/IndexReaderUtils.java
@@ -20,6 +20,8 @@
 import io.anserini.search.SearchArgs;
 import io.anserini.search.query.BagOfWordsQueryGenerator;
 import io.anserini.search.query.PhraseQueryGenerator;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
@@ -65,6 +67,7 @@
  * This class provides a lot of functionality that is exposed in Python via Pyserini.
  */
 public class IndexReaderUtils {
+  private static final Logger LOG = LogManager.getLogger(IndexReaderUtils.class);
 
   /**
    * An individual posting in a postings list. Note that this class is used primarily for inspecting
@@ -726,7 +729,130 @@ public static float computeQueryDocumentScoreWithSimilarityAndAnalyzer(
     return rs.scoreDocs.length == 0 ? 0 : rs.scoreDocs[0].score - 1;
   }
 
-  // TODO: Write a variant of computeQueryDocumentScore that takes a set of documents.
+  /**
+   * Computes the scores of a batch of documents with respect to a query, using BM25 with the default
+   * {@link SearchArgs} parameters and {@link IndexCollection#DEFAULT_ANALYZER}.
+   *
+   * @param reader index reader
+   * @param docids docids of the documents to score
+   * @param q query
+   * @return a map from each requested docid to its score; documents that do not match the query score 0
+   * @throws IOException if error encountered during query
+   */
+  public static Map<String, Float> batchComputeQueryDocumentScore(
+      IndexReader reader, List<String> docids, String q)
+      throws IOException {
+    SearchArgs args = new SearchArgs();
+    return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q,
+        new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0])),
+        IndexCollection.DEFAULT_ANALYZER);
+  }
+
+  /**
+   * Computes the scores of a batch of documents with respect to a query given a scoring function, using
+   * {@link IndexCollection#DEFAULT_ANALYZER} to analyze the query.
+   *
+   * @param reader index reader
+   * @param docids docids of the documents to score
+   * @param q query
+   * @param similarity scoring function
+   * @return a map from each requested docid to its score; documents that do not match the query score 0
+   * @throws IOException if error encountered during query
+   */
+  public static Map<String, Float> batchComputeQueryDocumentScore(
+      IndexReader reader, List<String> docids, String q, Similarity similarity)
+      throws IOException {
+    return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, similarity,
+        IndexCollection.DEFAULT_ANALYZER);
+  }
+
+  /**
+   * Computes the scores of a batch of documents with respect to a query given an analyzer, using BM25 with
+   * the default {@link SearchArgs} parameters as the scoring function.
+   *
+   * @param reader index reader
+   * @param docids docids of the documents to score
+   * @param q query
+   * @param analyzer analyzer to use
+   * @return a map from each requested docid to its score; documents that do not match the query score 0
+   * @throws IOException if error encountered during query
+   */
+  public static Map<String, Float> batchComputeQueryDocumentScore(
+      IndexReader reader, List<String> docids, String q, Analyzer analyzer)
+      throws IOException {
+    SearchArgs args = new SearchArgs();
+    return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q,
+        new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0])),
+        analyzer);
+  }
+
+  /**
+   * Computes the scores of a batch of documents with respect to a query given a scoring function and an
+   * analyzer; delegates to {@link #batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer}.
+   *
+   * @param reader index reader
+   * @param docids docids of the documents to score
+   * @param q query
+   * @param similarity scoring function
+   * @param analyzer analyzer to use
+   * @return a map from each requested docid to its score; documents that do not match the query score 0
+   * @throws IOException if error encountered during query
+   */
+  public static Map<String, Float> batchComputeQueryDocumentScore(
+      IndexReader reader, List<String> docids, String q, Similarity similarity, Analyzer analyzer)
+      throws IOException {
+    return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, similarity, analyzer);
+  }
+
+  /**
+   * Computes the scores of a batch of documents with respect to a query given a scoring function and an
+   * analyzer. The query is issued once, restricted by a non-scoring filter to the supplied docids, so each
+   * returned value is the plain retrieval score of that document.
+   *
+   * @param reader index reader
+   * @param docids docids of the documents to score; an empty list yields an empty map
+   * @param q query
+   * @param similarity scoring function
+   * @param analyzer analyzer to use
+   * @return a map from each requested docid to its score; documents that do not match the query score 0
+   * @throws IOException if error encountered during query
+   */
+  public static Map<String, Float> batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(
+      IndexReader reader, List<String> docids, String q, Similarity similarity, Analyzer analyzer)
+      throws IOException {
+    Map<String, Float> results = new HashMap<>();
+    // Lucene rejects a request for zero hits, so an empty batch short-circuits to an empty result.
+    if (docids.isEmpty()) {
+      return results;
+    }
+
+    IndexSearcher searcher = new IndexSearcher(reader);
+    searcher.setSimilarity(similarity);
+
+    Query query = new BagOfWordsQueryGenerator().buildQuery(IndexArgs.CONTENTS, analyzer, q);
+
+    // The docid restriction lives in its own nested disjunction. Adding the docid clauses as SHOULD next to
+    // the MUST query would leave them optional, letting documents outside the batch into the results.
+    BooleanQuery.Builder docidFilter = new BooleanQuery.Builder();
+    for (String docid : docids) {
+      // Every requested docid defaults to 0 and is overwritten below if the query matches it.
+      results.put(docid, 0.0f);
+      docidFilter.add(new TermQuery(new Term(IndexArgs.ID, docid)), BooleanClause.Occur.SHOULD);
+    }
+
+    // FILTER clauses must match but do not contribute to the score, so no constant needs subtracting.
+    Query finalQuery = new BooleanQuery.Builder()
+        .add(docidFilter.build(), BooleanClause.Occur.FILTER)
+        .add(query, BooleanClause.Occur.MUST)
+        .build();
+
+    TopDocs rs = searcher.search(finalQuery, docids.size());
+    for (int i = 0; i < rs.scoreDocs.length; i++) {
+      results.put(convertLuceneDocidToDocid(reader, rs.scoreDocs[i].doc), rs.scoreDocs[i].score);
+    }
+
+    return results;
+  }
 
   /**
    * Converts a collection docid to a Lucene internal docid.
diff --git a/src/test/java/io/anserini/index/IndexReaderUtilsTest.java b/src/test/java/io/anserini/index/IndexReaderUtilsTest.java
index a785235c40..c31236a6d5 100755
--- a/src/test/java/io/anserini/index/IndexReaderUtilsTest.java
+++ b/src/test/java/io/anserini/index/IndexReaderUtilsTest.java
@@ -21,6 +21,7 @@
 import io.anserini.analysis.DefaultEnglishAnalyzer;
 import io.anserini.search.SearchArgs;
 import io.anserini.search.SimpleSearcher;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexReader;
@@ -40,6 +41,7 @@
 
 import java.io.ByteArrayOutputStream;
 import java.io.PrintStream;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
@@ -535,6 +537,53 @@ public void testComputeQueryDocumentScore() throws Exception {
     dir.close();
   }
 
+  @Test
+  public void testBatchComputeQueryDocumentScore() throws Exception {
+    SimpleSearcher searcher1 = new SimpleSearcher(tempDir1.toString());
+    // The second searcher uses an analyzer other than the default: Krovetz stemming.
+    Analyzer stemAnalyzer = DefaultEnglishAnalyzer.newStemmingInstance("krovetz");
+    SimpleSearcher searcher2 = new SimpleSearcher(tempDir1.toString(), stemAnalyzer);
+    Directory dir = FSDirectory.open(tempDir1);
+    IndexReader reader = DirectoryReader.open(dir);
+    Similarity similarity = new BM25Similarity(0.9f, 0.4f);
+
+    // A bunch of test queries...
+    String[] queries = {"text city", "text", "city"};
+
+    for (String query : queries) {
+      SimpleSearcher.Result[] results1 = searcher1.search(query);
+
+      List<String> docids = new ArrayList<>();
+      for (SimpleSearcher.Result result : results1) {
+        docids.add(result.docid);
+      }
+
+      // Batch scores must agree with the scores the searcher assigned to the same documents.
+      Map<String, Float> batchScore1 =
+          IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, query, similarity);
+      for (SimpleSearcher.Result result : results1) {
+        assertEquals(result.score, batchScore1.get(result.docid), 10e-5);
+      }
+
+      SimpleSearcher.Result[] results2 = searcher2.search(query);
+      Map<String, Float> batchScore2 =
+          IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, query, similarity, stemAnalyzer);
+      for (SimpleSearcher.Result result : results2) {
+        assertEquals(result.score, batchScore2.get(result.docid), 10e-5);
+      }
+
+      // This is hard coded - doc3 isn't retrieved by any of the queries.
+      String fakeId = "doc3";
+      docids = List.of(fakeId);
+      Map<String, Float> batchScore = IndexReaderUtils.batchComputeQueryDocumentScore(
+          reader, docids, query, similarity);
+      assertEquals(0.0f, batchScore.get(fakeId), 10e-6);
+    }
+
+    reader.close();
+    dir.close();
+  }
+
   @Test
   public void testGetIndexStats() throws Exception {
     Directory dir = FSDirectory.open(tempDir1);