From 006b934d8bb4c97dc25757dccdf0cbfd8d7d0e10 Mon Sep 17 00:00:00 2001 From: Thong Nguyen Date: Thu, 23 Mar 2023 15:16:16 +0100 Subject: [PATCH 1/5] lsr index with FeatureField --- .../collection/JsonTermWeightCollection.java | 79 +++++++++++++++++++ .../collection/SourceTermWeightDocument.java | 32 ++++++++ .../TermWeightDocumentGenerator.java | 79 +++++++++++++++++++ .../io/anserini/search/SearchCollection.java | 6 +- .../query/BagOfWordsQueryGenerator.java | 33 +++++++- .../search/query/FeatureGenerator.java | 15 ++++ 6 files changed, 242 insertions(+), 2 deletions(-) create mode 100644 src/main/java/io/anserini/collection/JsonTermWeightCollection.java create mode 100644 src/main/java/io/anserini/collection/SourceTermWeightDocument.java create mode 100644 src/main/java/io/anserini/index/generator/TermWeightDocumentGenerator.java create mode 100644 src/main/java/io/anserini/search/query/FeatureGenerator.java diff --git a/src/main/java/io/anserini/collection/JsonTermWeightCollection.java b/src/main/java/io/anserini/collection/JsonTermWeightCollection.java new file mode 100644 index 0000000000..56e50c3b0a --- /dev/null +++ b/src/main/java/io/anserini/collection/JsonTermWeightCollection.java @@ -0,0 +1,79 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.anserini.collection; + +import com.fasterxml.jackson.databind.JsonNode; + +import java.io.BufferedReader; +import java.io.IOException; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Map; + +/**xxw + * A JSON document collection where the user can specify directly the vector to be indexed. + */ +public class JsonTermWeightCollection extends DocumentCollection { + public JsonTermWeightCollection(Path path) { + this.path = path; + } + + @Override + public FileSegment createFileSegment(BufferedReader bufferedReader) throws IOException { + return new JsonTermWeightCollection.Segment<>(bufferedReader); + } + @Override + public FileSegment createFileSegment(Path path) throws IOException { + return new JsonTermWeightCollection.Segment<>(path); + } + public static class Segment extends JsonCollection.Segment { + public Segment(Path path) throws IOException { + super(path); + } + public Segment(BufferedReader bufferedReader) throws IOException { + super(bufferedReader); + } + @Override + protected Document createNewDocument(JsonNode json) { + return new Document(json); + } + } + + public static class Document extends JsonCollection.Document implements SourceTermWeightDocument { + private Map vector; + public Document(JsonNode json) { + super(json); + this.vector = new HashMap<>(); + // We're going to take the map associated with "vector" and generate pseudo-document. 
+ JsonNode vectorNode = json.get("vector"); + + // Iterate through the features: + final StringBuilder sb = new StringBuilder(); + vectorNode.fields().forEachRemaining( e -> { + Float cnt = e.getValue().floatValue(); + // Generate pseudo-document by appending the feature cnt times, + // where cnt is the value of the feature + this.vector.put(e.getKey(), cnt); + }); + } + + @Override + public Map vector() { + return this.vector; + } + } +} diff --git a/src/main/java/io/anserini/collection/SourceTermWeightDocument.java b/src/main/java/io/anserini/collection/SourceTermWeightDocument.java new file mode 100644 index 0000000000..10f0671fc3 --- /dev/null +++ b/src/main/java/io/anserini/collection/SourceTermWeightDocument.java @@ -0,0 +1,32 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.collection; + +import java.util.Map; + +/** + * A raw document from a collection. A {@code SourceDocument} is explicitly distinguished from a + * Lucene {@link org.apache.lucene.document.Document}, which is the Lucene representation that + * can be directly inserted into an index. 
+ */ +public interface SourceTermWeightDocument{ + /** + * Return the vector containing term and weight + * @return a map that map term to weight + */ + Map vector(); +} diff --git a/src/main/java/io/anserini/index/generator/TermWeightDocumentGenerator.java b/src/main/java/io/anserini/index/generator/TermWeightDocumentGenerator.java new file mode 100644 index 0000000000..ee0f5cf875 --- /dev/null +++ b/src/main/java/io/anserini/index/generator/TermWeightDocumentGenerator.java @@ -0,0 +1,79 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.index.generator; + +import io.anserini.collection.InvalidContentsException; +import io.anserini.collection.SourceDocument; +import io.anserini.collection.SourceTermWeightDocument; +import io.anserini.index.Constants; +import io.anserini.index.IndexCollection; +import org.apache.lucene.document.*; +import org.apache.lucene.util.BytesRef; + +import java.util.Map; + +/** + * Converts a {@link SourceDocument} into a Lucene {@link Document}, ready to be indexed. 
+ * + * @param type of the source document + */ +public class TermWeightDocumentGenerator implements LuceneDocumentGenerator { + protected IndexCollection.Args args; + + protected TermWeightDocumentGenerator() { + } + /** + * Constructor with config and counters + * + * @param args configuration arguments + */ + public TermWeightDocumentGenerator(IndexCollection.Args args) { + this.args = args; + } + + @Override + public Document createDocument(T src) throws GeneratorException { + String id = src.id(); + Map vector; + try { + vector = src.vector(); + } catch (InvalidContentsException e) { + // Catch and rethrow; indexer will eat the exception at top level and increment counters accordingly. + throw new InvalidDocumentException(); + } + + if (vector.size() == 0) { + throw new EmptyDocumentException(); + } + + // Make a new, empty document. + final Document document = new Document(); + + // Store the collection docid. + document.add(new StringField(Constants.ID, id, Field.Store.YES)); + // This is needed to break score ties by docid. 
+ document.add(new BinaryDocValuesField(Constants.ID, new BytesRef(id))); + + if (args.storeRaw) { + document.add(new StoredField(Constants.RAW, src.raw())); + } + for (String term : vector.keySet()){ + document.add(new FeatureField(Constants.CONTENTS, term, vector.get(term))); + } + return document; + } +} diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java index 0c25a9fcde..4951b1241b 100644 --- a/src/main/java/io/anserini/search/SearchCollection.java +++ b/src/main/java/io/anserini/search/SearchCollection.java @@ -35,6 +35,7 @@ import io.anserini.rerank.lib.Rm3Reranker; import io.anserini.rerank.lib.RocchioReranker; import io.anserini.rerank.lib.ScoreTiesAdjusterReranker; +import io.anserini.search.query.BagOfWordsQueryGenerator; import io.anserini.search.query.QueryGenerator; import io.anserini.search.query.SdmQueryGenerator; import io.anserini.search.similarity.AccurateBM25Similarity; @@ -1250,7 +1251,10 @@ public ScoredDocuments search(IndexSearcher searcher, K qid, String queryStr if (args.sdm) { query = new SdmQueryGenerator(args.sdm_tw, args.sdm_ow, args.sdm_uw).buildQuery(Constants.CONTENTS, analyzer, queryString); - } else { + } else if (args.impact){ + query = new BagOfWordsQueryGenerator().buildFeatureQuery(Constants.CONTENTS, analyzer, queryString); + } + else { QueryGenerator generator; try { generator = (QueryGenerator) Class.forName("io.anserini.search.query." 
+ args.queryGenerator) diff --git a/src/main/java/io/anserini/search/query/BagOfWordsQueryGenerator.java b/src/main/java/io/anserini/search/query/BagOfWordsQueryGenerator.java index e1bf6e16b1..903ec37c27 100644 --- a/src/main/java/io/anserini/search/query/BagOfWordsQueryGenerator.java +++ b/src/main/java/io/anserini/search/query/BagOfWordsQueryGenerator.java @@ -17,7 +17,9 @@ package io.anserini.search.query; import io.anserini.analysis.AnalyzerUtils; +import io.anserini.index.Constants; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.FeatureField; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; @@ -25,6 +27,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.function.Function; @@ -33,7 +36,7 @@ /* * Bag of Terms query builder */ -public class BagOfWordsQueryGenerator extends QueryGenerator { +public class BagOfWordsQueryGenerator extends QueryGenerator implements FeatureGenerator { @Override public Query buildQuery(String field, Analyzer analyzer, String queryText) { List tokens = AnalyzerUtils.analyze(analyzer, queryText); @@ -47,6 +50,34 @@ public Query buildQuery(String field, Analyzer analyzer, String queryText) { return builder.build(); } + public Query buildFeatureQuery(String field, Analyzer analyzer, String queryText) { + List tokens = AnalyzerUtils.analyze(analyzer, queryText); + Map collect = tokens.stream() + .collect(Collectors.groupingBy(Function.identity(), Collectors.counting())); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + Map normalizedScore = new HashMap<>(); + float maxWeight = 0; + for (String t : collect.keySet()){ + float s = (float) collect.get(t); + normalizedScore.put(t, s); + if (s > maxWeight) { + maxWeight = s; + } + } + // The maximum weight for FeatureQuery is 64, this constraint 
could be lifted but might not be necessary. + // Note: This normalization makes the scores between different queries not comparable + if (maxWeight > 64){ + for (String t : normalizedScore.keySet()){ + normalizedScore.put(t,normalizedScore.get(t)/maxWeight* (float)64.0); + } + } + + for (String t : normalizedScore.keySet()) { + builder.add(FeatureField.newLinearQuery(Constants.CONTENTS, t, normalizedScore.get(t)),BooleanClause.Occur.SHOULD); + } + return builder.build(); + } + @Override public Query buildQuery(Map fields, Analyzer analyzer, String queryText) { BooleanQuery.Builder builder = new BooleanQuery.Builder(); diff --git a/src/main/java/io/anserini/search/query/FeatureGenerator.java b/src/main/java/io/anserini/search/query/FeatureGenerator.java new file mode 100644 index 0000000000..877c1241ee --- /dev/null +++ b/src/main/java/io/anserini/search/query/FeatureGenerator.java @@ -0,0 +1,15 @@ +package io.anserini.search.query; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.search.Query; + +public interface FeatureGenerator{ + /** + * Generate queries with terms as features + * @param field + * @param analyzer + * @param queryText + * @return + */ + Query buildFeatureQuery(String field, Analyzer analyzer, String queryText); +} From eb3f19763c7e9e0c1f3d3aa59042d339e88fb4b3 Mon Sep 17 00:00:00 2001 From: Thong Nguyen Date: Mon, 27 Mar 2023 01:45:40 +0200 Subject: [PATCH 2/5] renaming classes --- ...nTermWeightCollection.java => JsonSparseVectorCollection.java} | 0 ...tDocumentGenerator.java => SparseVectorDocumentGenerator.java} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename src/main/java/io/anserini/collection/{JsonTermWeightCollection.java => JsonSparseVectorCollection.java} (100%) rename src/main/java/io/anserini/index/generator/{TermWeightDocumentGenerator.java => SparseVectorDocumentGenerator.java} (100%) diff --git a/src/main/java/io/anserini/collection/JsonTermWeightCollection.java 
b/src/main/java/io/anserini/collection/JsonSparseVectorCollection.java similarity index 100% rename from src/main/java/io/anserini/collection/JsonTermWeightCollection.java rename to src/main/java/io/anserini/collection/JsonSparseVectorCollection.java diff --git a/src/main/java/io/anserini/index/generator/TermWeightDocumentGenerator.java b/src/main/java/io/anserini/index/generator/SparseVectorDocumentGenerator.java similarity index 100% rename from src/main/java/io/anserini/index/generator/TermWeightDocumentGenerator.java rename to src/main/java/io/anserini/index/generator/SparseVectorDocumentGenerator.java From fe47e83881683aad6a3f4ac2f9a86dccc1ef78e5 Mon Sep 17 00:00:00 2001 From: Thong Nguyen Date: Mon, 27 Mar 2023 01:57:23 +0200 Subject: [PATCH 3/5] renaming class name --- .../{SourceTermWeightDocument.java => SourceSparseDocument.java} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/main/java/io/anserini/collection/{SourceTermWeightDocument.java => SourceSparseDocument.java} (100%) diff --git a/src/main/java/io/anserini/collection/SourceTermWeightDocument.java b/src/main/java/io/anserini/collection/SourceSparseDocument.java similarity index 100% rename from src/main/java/io/anserini/collection/SourceTermWeightDocument.java rename to src/main/java/io/anserini/collection/SourceSparseDocument.java From 20fab3cebf6e0ee65b552d37f347788b83579144 Mon Sep 17 00:00:00 2001 From: Thong Nguyen Date: Mon, 27 Mar 2023 02:02:38 +0200 Subject: [PATCH 4/5] update implementation --- .../JsonSparseVectorCollection.java | 25 +++++++++++-------- ...t.java => SourceSparseVectorDocument.java} | 2 +- .../SparseVectorDocumentGenerator.java | 9 ++++--- 3 files changed, 21 insertions(+), 15 deletions(-) rename src/main/java/io/anserini/collection/{SourceSparseDocument.java => SourceSparseVectorDocument.java} (95%) diff --git a/src/main/java/io/anserini/collection/JsonSparseVectorCollection.java b/src/main/java/io/anserini/collection/JsonSparseVectorCollection.java index 
56e50c3b0a..303e636021 100644 --- a/src/main/java/io/anserini/collection/JsonSparseVectorCollection.java +++ b/src/main/java/io/anserini/collection/JsonSparseVectorCollection.java @@ -24,36 +24,40 @@ import java.util.HashMap; import java.util.Map; -/**xxw - * A JSON document collection where the user can specify directly the vector to be indexed. +/** + * A JSON sparse document collection for learned sparse retrieval */ -public class JsonTermWeightCollection extends DocumentCollection { - public JsonTermWeightCollection(Path path) { +public class JsonSparseVectorCollection extends DocumentCollection { + public JsonSparseVectorCollection(Path path) { this.path = path; } @Override - public FileSegment createFileSegment(BufferedReader bufferedReader) throws IOException { - return new JsonTermWeightCollection.Segment<>(bufferedReader); + public FileSegment createFileSegment(BufferedReader bufferedReader) throws IOException { + return new JsonSparseVectorCollection.Segment<>(bufferedReader); } + @Override - public FileSegment createFileSegment(Path path) throws IOException { - return new JsonTermWeightCollection.Segment<>(path); + public FileSegment createFileSegment(Path path) throws IOException { + return new JsonSparseVectorCollection.Segment<>(path); } - public static class Segment extends JsonCollection.Segment { + + public static class Segment extends JsonCollection.Segment { public Segment(Path path) throws IOException { super(path); } + public Segment(BufferedReader bufferedReader) throws IOException { super(bufferedReader); } + @Override protected Document createNewDocument(JsonNode json) { return new Document(json); } } - public static class Document extends JsonCollection.Document implements SourceTermWeightDocument { + public static class Document extends JsonCollection.Document implements SourceSparseVectorDocument { private Map vector; public Document(JsonNode json) { super(json); @@ -75,5 +79,6 @@ public Document(JsonNode json) { public Map vector() { 
return this.vector; } + } } diff --git a/src/main/java/io/anserini/collection/SourceSparseDocument.java b/src/main/java/io/anserini/collection/SourceSparseVectorDocument.java similarity index 95% rename from src/main/java/io/anserini/collection/SourceSparseDocument.java rename to src/main/java/io/anserini/collection/SourceSparseVectorDocument.java index 10f0671fc3..9a4681e4e2 100644 --- a/src/main/java/io/anserini/collection/SourceSparseDocument.java +++ b/src/main/java/io/anserini/collection/SourceSparseVectorDocument.java @@ -23,7 +23,7 @@ * Lucene {@link org.apache.lucene.document.Document}, which is the Lucene representation that * can be directly inserted into an index. */ -public interface SourceTermWeightDocument{ +public interface SourceSparseVectorDocument { /** * Return the vector containing term and weight * @return a map that map term to weight diff --git a/src/main/java/io/anserini/index/generator/SparseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/SparseVectorDocumentGenerator.java index ee0f5cf875..52e17fe002 100644 --- a/src/main/java/io/anserini/index/generator/SparseVectorDocumentGenerator.java +++ b/src/main/java/io/anserini/index/generator/SparseVectorDocumentGenerator.java @@ -18,7 +18,7 @@ import io.anserini.collection.InvalidContentsException; import io.anserini.collection.SourceDocument; -import io.anserini.collection.SourceTermWeightDocument; +import io.anserini.collection.SourceSparseVectorDocument; import io.anserini.index.Constants; import io.anserini.index.IndexCollection; import org.apache.lucene.document.*; @@ -31,17 +31,18 @@ * * @param type of the source document */ -public class TermWeightDocumentGenerator implements LuceneDocumentGenerator { +public class SparseVectorDocumentGenerator implements LuceneDocumentGenerator { protected IndexCollection.Args args; - protected TermWeightDocumentGenerator() { + protected SparseVectorDocumentGenerator() { + } /** * Constructor with config and counters * * @param 
args configuration arguments */ - public TermWeightDocumentGenerator(IndexCollection.Args args) { + public SparseVectorDocumentGenerator(IndexCollection.Args args) { this.args = args; } From 3bc3664ab90a6c2d950ef540b47c71639ac58922 Mon Sep 17 00:00:00 2001 From: Thong Nguyen Date: Mon, 27 Mar 2023 03:19:32 +0200 Subject: [PATCH 5/5] code formatting --- .../io/anserini/search/SearchCollection.java | 3 +-- .../anserini/search/query/FeatureGenerator.java | 16 ++++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java index 4951b1241b..fe41240a8a 100644 --- a/src/main/java/io/anserini/search/SearchCollection.java +++ b/src/main/java/io/anserini/search/SearchCollection.java @@ -1253,8 +1253,7 @@ public ScoredDocuments search(IndexSearcher searcher, K qid, String queryStr query = new SdmQueryGenerator(args.sdm_tw, args.sdm_ow, args.sdm_uw).buildQuery(Constants.CONTENTS, analyzer, queryString); } else if (args.impact){ query = new BagOfWordsQueryGenerator().buildFeatureQuery(Constants.CONTENTS, analyzer, queryString); - } - else { + } else { QueryGenerator generator; try { generator = (QueryGenerator) Class.forName("io.anserini.search.query." 
+ args.queryGenerator) diff --git a/src/main/java/io/anserini/search/query/FeatureGenerator.java b/src/main/java/io/anserini/search/query/FeatureGenerator.java index 877c1241ee..7c6ead9bc0 100644 --- a/src/main/java/io/anserini/search/query/FeatureGenerator.java +++ b/src/main/java/io/anserini/search/query/FeatureGenerator.java @@ -4,12 +4,12 @@ import org.apache.lucene.search.Query; public interface FeatureGenerator{ - /** - * Generate queries with terms as features - * @param field - * @param analyzer - * @param queryText - * @return - */ - Query buildFeatureQuery(String field, Analyzer analyzer, String queryText); + /** + * Generate queries with terms as features + * @param field + * @param analyzer + * @param queryText + * @return + */ + Query buildFeatureQuery(String field, Analyzer analyzer, String queryText); }