diff --git a/src/main/java/io/anserini/collection/JsonSparseVectorCollection.java b/src/main/java/io/anserini/collection/JsonSparseVectorCollection.java
new file mode 100644
index 0000000000..303e636021
--- /dev/null
+++ b/src/main/java/io/anserini/collection/JsonSparseVectorCollection.java
@@ -0,0 +1,88 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.collection;
+
+import com.fasterxml.jackson.databind.JsonNode;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * A collection of JSON documents for learned sparse retrieval: each document carries a
+ * {@code "vector"} object that maps a term to its weight.
+ */
+public class JsonSparseVectorCollection extends DocumentCollection<JsonSparseVectorCollection.Document> {
+  public JsonSparseVectorCollection(Path path) {
+    this.path = path;
+  }
+
+  @Override
+  public FileSegment<JsonSparseVectorCollection.Document> createFileSegment(BufferedReader bufferedReader) throws IOException {
+    return new JsonSparseVectorCollection.Segment<>(bufferedReader);
+  }
+
+  @Override
+  public FileSegment<JsonSparseVectorCollection.Document> createFileSegment(Path path) throws IOException {
+    return new JsonSparseVectorCollection.Segment<>(path);
+  }
+
+  /**
+   * A file segment whose JSON records are materialized as sparse vector {@link Document}s.
+   *
+   * @param <T> type of the document
+   */
+  public static class Segment<T extends JsonSparseVectorCollection.Document> extends JsonCollection.Segment<T> {
+    public Segment(Path path) throws IOException {
+      super(path);
+    }
+
+    public Segment(BufferedReader bufferedReader) throws IOException {
+      super(bufferedReader);
+    }
+
+    @Override
+    protected Document createNewDocument(JsonNode json) {
+      return new Document(json);
+    }
+  }
+
+  /**
+   * A JSON document that additionally exposes the term weights found under its {@code "vector"} key.
+   */
+  public static class Document extends JsonCollection.Document implements SourceSparseVectorDocument {
+    private final Map<String, Float> vector = new HashMap<>();
+
+    public Document(JsonNode json) {
+      super(json);
+
+      // A record without a "vector" object keeps an empty vector instead of failing with a NullPointerException.
+      JsonNode vectorNode = json.get("vector");
+      if (vectorNode != null) {
+        // Copy every (term, weight) pair of the "vector" object into the map.
+        vectorNode.fields().forEachRemaining(e -> this.vector.put(e.getKey(), e.getValue().floatValue()));
+      }
+    }
+
+    @Override
+    public Map<String, Float> vector() {
+      return this.vector;
+    }
+  }
+}
diff --git a/src/main/java/io/anserini/collection/SourceSparseVectorDocument.java b/src/main/java/io/anserini/collection/SourceSparseVectorDocument.java
new file mode 100644
index 0000000000..9a4681e4e2
--- /dev/null
+++ b/src/main/java/io/anserini/collection/SourceSparseVectorDocument.java
@@ -0,0 +1,34 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.collection;
+
+import java.util.Map;
+
+/**
+ * A raw document from a collection that is represented as a sparse vector, i.e., a set of terms
+ * with associated weights. Like a {@code SourceDocument}, it is explicitly distinguished from a
+ * Lucene {@link org.apache.lucene.document.Document}, which is the Lucene representation that
+ * can be directly inserted into an index.
+ */
+public interface SourceSparseVectorDocument {
+  /**
+   * Returns the sparse vector of this document.
+   *
+   * @return a map from each term to its weight
+   */
+  Map<String, Float> vector();
+}
diff --git a/src/main/java/io/anserini/index/generator/SparseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/SparseVectorDocumentGenerator.java
new file mode 100644
index 0000000000..52e17fe002
--- /dev/null
+++ b/src/main/java/io/anserini/index/generator/SparseVectorDocumentGenerator.java
@@ -0,0 +1,92 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.index.generator;
+
+import io.anserini.collection.InvalidContentsException;
+import io.anserini.collection.SourceDocument;
+import io.anserini.collection.SourceSparseVectorDocument;
+import io.anserini.index.Constants;
+import io.anserini.index.IndexCollection;
+import org.apache.lucene.document.*;
+import org.apache.lucene.util.BytesRef;
+
+import java.util.Map;
+
+/**
+ * Converts a {@link SourceSparseVectorDocument} into a Lucene {@link Document}, ready to be indexed.
+ * Every (term, weight) pair of the source vector becomes one {@link FeatureField}.
+ *
+ * @param <T> type of the source document
+ */
+public class SparseVectorDocumentGenerator<T extends SourceDocument & SourceSparseVectorDocument>
+    implements LuceneDocumentGenerator<T> {
+  protected IndexCollection.Args args;
+
+  protected SparseVectorDocumentGenerator() {
+  }
+
+  /**
+   * Constructor with config
+   *
+   * @param args configuration arguments
+   */
+  public SparseVectorDocumentGenerator(IndexCollection.Args args) {
+    this.args = args;
+  }
+
+  /**
+   * Builds the Lucene document for a source document.
+   *
+   * @param src source document
+   * @return Lucene document holding the docid and one feature per term of the vector
+   * @throws GeneratorException if reading the vector fails or the vector is empty
+   */
+  @Override
+  public Document createDocument(T src) throws GeneratorException {
+    String id = src.id();
+    Map<String, Float> vector;
+    try {
+      vector = src.vector();
+    } catch (InvalidContentsException e) {
+      // Catch and rethrow; indexer will eat the exception at top level and increment counters accordingly.
+      throw new InvalidDocumentException();
+    }
+
+    if (vector == null || vector.isEmpty()) {
+      throw new EmptyDocumentException();
+    }
+
+    // Make a new, empty document.
+    final Document document = new Document();
+
+    // Store the collection docid.
+    document.add(new StringField(Constants.ID, id, Field.Store.YES));
+    // This is needed to break score ties by docid.
+    document.add(new BinaryDocValuesField(Constants.ID, new BytesRef(id)));
+
+    // args is not set by the protected no-arg constructor, so guard against null.
+    if (args != null && args.storeRaw) {
+      document.add(new StoredField(Constants.RAW, src.raw()));
+    }
+
+    // One feature per term; iterating over entries avoids a second lookup per term.
+    for (Map.Entry<String, Float> entry : vector.entrySet()) {
+      document.add(new FeatureField(Constants.CONTENTS, entry.getKey(), entry.getValue()));
+    }
+    return document;
+  }
+}
diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java
index 0c25a9fcde..fe41240a8a 100644
--- a/src/main/java/io/anserini/search/SearchCollection.java
+++ b/src/main/java/io/anserini/search/SearchCollection.java
@@ -35,6 +35,7 @@
 import io.anserini.rerank.lib.Rm3Reranker;
 import io.anserini.rerank.lib.RocchioReranker;
 import io.anserini.rerank.lib.ScoreTiesAdjusterReranker;
+import io.anserini.search.query.BagOfWordsQueryGenerator;
 import io.anserini.search.query.QueryGenerator;
 import io.anserini.search.query.SdmQueryGenerator;
 import io.anserini.search.similarity.AccurateBM25Similarity;
@@ -1250,6 +1251,8 @@ public <K> ScoredDocuments search(IndexSearcher searcher, K qid, String queryStr
 
     if (args.sdm) {
       query = new SdmQueryGenerator(args.sdm_tw, args.sdm_ow, args.sdm_uw).buildQuery(Constants.CONTENTS, analyzer, queryString);
+    } else if (args.impact) {
+      query = new BagOfWordsQueryGenerator().buildFeatureQuery(Constants.CONTENTS, analyzer, queryString);
     } else {
       QueryGenerator generator;
       try {
diff --git a/src/main/java/io/anserini/search/query/BagOfWordsQueryGenerator.java b/src/main/java/io/anserini/search/query/BagOfWordsQueryGenerator.java
index e1bf6e16b1..903ec37c27 100644
--- a/src/main/java/io/anserini/search/query/BagOfWordsQueryGenerator.java
+++ b/src/main/java/io/anserini/search/query/BagOfWordsQueryGenerator.java
@@ -17,7 +17,9 @@
 package io.anserini.search.query;
 
 import io.anserini.analysis.AnalyzerUtils;
+import io.anserini.index.Constants;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.FeatureField;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
@@ -25,6 +27,7 @@
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
 
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.function.Function;
@@ -33,7 +36,7 @@
 /*
  * Bag of Terms query builder
  */
-public class BagOfWordsQueryGenerator extends QueryGenerator {
+public class BagOfWordsQueryGenerator extends QueryGenerator implements FeatureGenerator {
   @Override
   public Query buildQuery(String field, Analyzer analyzer, String queryText) {
     List<String> tokens = AnalyzerUtils.analyze(analyzer, queryText);
@@ -47,6 +50,45 @@ public Query buildQuery(String field, Analyzer analyzer, String queryText) {
     return builder.build();
   }
 
+  /**
+   * Builds a disjunction of {@link FeatureField} linear queries, one per distinct query token, weighted by
+   * the number of times the token occurs in the analyzed query.
+   *
+   * @param field feature field to search
+   * @param analyzer analyzer used to tokenize the query
+   * @param queryText raw query text
+   * @return the feature query
+   */
+  @Override
+  public Query buildFeatureQuery(String field, Analyzer analyzer, String queryText) {
+    // The maximum weight for FeatureQuery is 64, this constraint could be lifted but might not be necessary.
+    final float maxFeatureWeight = 64.0f;
+
+    List<String> tokens = AnalyzerUtils.analyze(analyzer, queryText);
+    Map<String, Long> counts = tokens.stream()
+        .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()));
+
+    Map<String, Float> weights = new HashMap<>();
+    float maxWeight = 0;
+    for (Map.Entry<String, Long> entry : counts.entrySet()) {
+      float weight = entry.getValue().floatValue();
+      weights.put(entry.getKey(), weight);
+      maxWeight = Math.max(maxWeight, weight);
+    }
+
+    // Note: This normalization makes the scores between different queries not comparable
+    if (maxWeight > maxFeatureWeight) {
+      // Divide before multiplying so that the largest weight maps to exactly the maximum.
+      final float largest = maxWeight;
+      weights.replaceAll((term, weight) -> weight / largest * maxFeatureWeight);
+    }
+
+    BooleanQuery.Builder builder = new BooleanQuery.Builder();
+    weights.forEach((term, weight) ->
+        builder.add(FeatureField.newLinearQuery(field, term, weight), BooleanClause.Occur.SHOULD));
+    return builder.build();
+  }
+
   @Override
   public Query buildQuery(Map<String, Float> fields, Analyzer analyzer, String queryText) {
     BooleanQuery.Builder builder = new BooleanQuery.Builder();
diff --git a/src/main/java/io/anserini/search/query/FeatureGenerator.java b/src/main/java/io/anserini/search/query/FeatureGenerator.java
new file mode 100644
index 0000000000..7c6ead9bc0
--- /dev/null
+++ b/src/main/java/io/anserini/search/query/FeatureGenerator.java
@@ -0,0 +1,35 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.search.query;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.Query;
+
+/**
+ * A query generator that can express the terms of a query as weighted features.
+ */
+public interface FeatureGenerator {
+  /**
+   * Generate queries with terms as features
+   *
+   * @param field field to search
+   * @param analyzer analyzer used to tokenize the query text
+   * @param queryText query text
+   * @return the generated query
+   */
+  Query buildFeatureQuery(String field, Analyzer analyzer, String queryText);
+}