nimasadri11 · crystina-z · Jan 26, 2022 · Jan 26, 2022
diff --git a/capreolus/benchmark/__init__.py b/capreolus/benchmark/__init__.py
@@ -16,8 +16,8 @@
 
 def validate(build_f):
     def validate_folds_file(self):
-        if not hasattr(self, "fold_file"):
-            logger.warning(f"Folds file is not found for Module {self.module_name}")
+        if not hasattr(self, "fold_file") or (self.fold_file is None):
+            logger.warning(f"Folds file is not found or undefined for Module {self.module_name}")
             return
 
         if self.fold_file.suffix != ".json":
@@ -35,8 +35,8 @@ def validate_folds_file(self):
         logger.info("Folds file validation finishes.")
 
     def validate_qrels_file(self):
-        if not hasattr(self, "qrel_file"):
-            logger.warning(f"Qrel file is not found for Module {self.module_name}")
+        if not hasattr(self, "qrel_file") or (self.qrel_file is None):
+            logger.warning(f"Qrel file is not found or undefined for Module {self.module_name}")
             return
 
         n_dup, qrels = 0, defaultdict(dict)
@@ -219,6 +219,11 @@ def ird_load_qrels(self):
         qrels = {}
         for name in self.ird_dataset_names:
             dataset = ir_datasets.load(name)
+
+            if not hasattr(dataset, "qrels_iter"):
+                logger.warning(f"Dataset {name} has no available qrels.")
+                continue
+
             for qrel in dataset.qrels_iter():
                 qrels.setdefault(qrel.query_id, {})
                 qrels[qrel.query_id][qrel.doc_id] = max(qrel.relevance, qrels[qrel.query_id].get(qrel.doc_id, -1))

diff --git a/capreolus/benchmark/msmarco_v2.py b/capreolus/benchmark/msmarco_v2.py
@@ -0,0 +1,31 @@
+import os
+import json
+from collections import defaultdict
+
+
+from capreolus import constants, Dependency
+from capreolus.utils.loginit import get_logger
+
+from . import Benchmark, IRDBenchmark
+
+logger = get_logger(__name__)
+PACKAGE_PATH = constants["PACKAGE_PATH"]
+
+
+@Benchmark.register
+class MSMARCOPassageV2(IRDBenchmark):
+    """
+    Qrels and training set data for MS MARCO Passage v2.
+
+    Dataset description:
+    https://microsoft.github.io/msmarco/TREC-Deep-Learning.html#passage-ranking-dataset
+    """
+
+    module_name = "mspsg_v2"
+    query_type = "text"
+    # ir_datasets dataset names; presumably the qrels of every listed split are merged by the IRD base class
+    ird_dataset_names = ["msmarco-passage-v2/train", "msmarco-passage-v2/dev1", "msmarco-passage-v2/trec-dl-2021"]
+    # depends on the collection module named "mspsg_v2"
+    dependencies = [Dependency(key="collection", module="collection", name="mspsg_v2")]
+    # JSON folds definition located in the package's "data" directory
+    fold_file = PACKAGE_PATH / "data" / "msmarcov2_passage_title_folds.json"
diff --git a/capreolus/collection/msmarco_v2.py b/capreolus/collection/msmarco_v2.py
@@ -0,0 +1,33 @@
+import json
+
+from capreolus import constants
+from capreolus.utils.loginit import get_logger
+
+from . import Collection, IRDCollection
+
+logger = get_logger(__name__)
+PACKAGE_PATH = constants["PACKAGE_PATH"]
+
+
+@Collection.register
+class MSMARCO_PSG_V2(IRDCollection):
+    """
+    MS MARCO Passage v2 collection.
+
+    Dataset description:
+    https://microsoft.github.io/msmarco/TREC-Deep-Learning.html#passage-ranking-dataset
+    """
+
+    module_name = "mspsg_v2"
+    ird_dataset_name = "msmarco-passage-v2"
+    collection_type = "JsonCollection"
+
+    def doc_as_json(self, doc):
+        """Serialize ``doc`` to a JSON string holding its ``doc_id`` under "id" and its ``text`` under "contents"."""
+        record = {"id": doc.doc_id, "contents": doc.text}
+        return json.dumps(record)
+
+    def get_doc(self, docid):
+        """Fetch ``docid`` from ``self.docs_store`` and return the ``text`` of the stored document."""
+        stored = self.docs_store.get(docid)
+        return stored.text
diff --git a/capreolus/data/msmarcov2_passage_title_folds.json b/capreolus/data/msmarcov2_passage_title_folds.json
diff --git a/capreolus/searcher/anserini.py b/capreolus/searcher/anserini.py
@@ -40,6 +40,7 @@ def _anserini_query_from_file(self, topicsfn, anserini_param_str, output_base_pa
 
         index_path = self.index.get_index_path()
         anserini_fat_jar = Anserini.get_fat_jar()
+        topic_reader = self.config["topicreader"]
         cmd = [
             "java",
             "-classpath",
@@ -49,7 +50,7 @@ def _anserini_query_from_file(self, topicsfn, anserini_param_str, output_base_pa
             "-Dapp.name=SearchCollection",
             "io.anserini.search.SearchCollection",
             "-topicreader",
-            "TsvString",
+            topic_reader,
             "-index",
             index_path,
             "-topics",
@@ -156,6 +157,7 @@ class BM25(AnseriniSearcherMixIn, Searcher):
         ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
         ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
         ConfigOption("hits", 1000, "number of results to return"),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
     ]
 
     def _query_from_file(self, topicsfn, output_path, config):
@@ -185,6 +187,7 @@ class BM25Grid(AnseriniSearcherMixIn, Searcher):
         ConfigOption("k1max", 1.0, "maximum k1 value to include in grid search (starting at 0.1)"),
         ConfigOption("bmax", 1.0, "maximum b value to include in grid search (starting at 0.1)"),
         ConfigOption("hits", 1000, "number of results to return"),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
     ]
 
     def _query_from_file(self, topicsfn, output_path, config):
@@ -212,6 +215,7 @@ class BM25RM3(AnseriniSearcherMixIn, Searcher):
         ConfigOption("fbDocs", [5, 10], "number of documents used for feedback", value_type="intlist"),
         ConfigOption("originalQueryWeight", [0.5], "the weight of unexpended query", value_type="floatlist"),
         ConfigOption("hits", 1000, "number of results to return"),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
     ]
 
     def _query_from_file(self, topicsfn, output_path, config):
@@ -239,6 +243,7 @@ class BM25PostProcess(BM25, PostprocessMixin):
         ConfigOption("hits", 1000, "number of results expected from the core searcher"),
         ConfigOption("topn", 1000, "number of results expected after the filtering (if any)"),
         ConfigOption("dedup", False),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
     ]
 
     def query_from_file(self, topicsfn, output_path, docs_to_remove=None):
@@ -357,6 +362,7 @@ class BM25PRF(AnseriniSearcherMixIn, Searcher):
         ConfigOption("fbDocs", [5, 10, 15], "number of documents used for feedback", value_type="intlist"),
         ConfigOption("newTermWeight", [0.2, 0.25], value_type="floatlist"),
         ConfigOption("hits", 1000, "number of results to return"),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
     ]
 
     def _query_from_file(self, topicsfn, output_path, config):
@@ -388,6 +394,7 @@ class AxiomaticSemanticMatching(AnseriniSearcherMixIn, Searcher):
         ConfigOption("beta", 0.4, value_type="floatlist"),
         ConfigOption("top", 20, value_type="intlist"),
         ConfigOption("hits", 1000, "number of results to return"),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
     ]
 
     def _query_from_file(self, topicsfn, output_path, config):
@@ -411,6 +418,7 @@ class DirichletQL(AnseriniSearcherMixIn, Searcher):
     config_spec = [
         ConfigOption("mu", 1000, "smoothing parameter", value_type="intlist"),
         ConfigOption("hits", 1000, "number of results to return"),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
     ]
 
     def _query_from_file(self, topicsfn, output_path, config):
@@ -436,7 +444,11 @@ class QLJM(AnseriniSearcherMixIn, Searcher):
     """Anserini QL with Jelinek-Mercer smoothing. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``)."""
 
     module_name = "QLJM"
-    config_spec = [ConfigOption("lam", 0.1, value_type="floatlist"), ConfigOption("hits", 1000, "number of results to return")]
+    config_spec = [
+        ConfigOption("lam", 0.1, value_type="floatlist"),
+        ConfigOption("hits", 1000, "number of results to return"),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
+    ]
 
     def _query_from_file(self, topicsfn, output_path, config):
         anserini_param_str = "-qljm -qljm.lambda {0} -hits {1}".format(list2str(config["lam"], delimiter=" "), config["hits"])
@@ -454,6 +466,7 @@ class INL2(AnseriniSearcherMixIn, Searcher):
     config_spec = [
         ConfigOption("c", 0.1),  # array input of this parameter is not support by anserini.SearchCollection
         ConfigOption("hits", 1000, "number of results to return"),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
     ]
 
     def _query_from_file(self, topicsfn, output_path, config):
@@ -472,6 +485,7 @@ class SPL(AnseriniSearcherMixIn, Searcher):
     config_spec = [
         ConfigOption("c", 0.1),  # array input of this parameter is not support by anserini.SearchCollection
         ConfigOption("hits", 1000, "number of results to return"),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
     ]
 
     def _query_from_file(self, topicsfn, output_path, config):
@@ -492,6 +506,7 @@ class F2Exp(AnseriniSearcherMixIn, Searcher):
     config_spec = [
         ConfigOption("s", 0.5),  # array input of this parameter is not support by anserini.SearchCollection
         ConfigOption("hits", 1000, "number of results to return"),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
     ]
 
     def _query_from_file(self, topicsfn, output_path, config):
@@ -512,6 +527,7 @@ class F2Log(AnseriniSearcherMixIn, Searcher):
     config_spec = [
         ConfigOption("s", 0.5),  # array input of this parameter is not support by anserini.SearchCollection
         ConfigOption("hits", 1000, "number of results to return"),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
     ]
 
     def _query_from_file(self, topicsfn, output_path, config):
@@ -537,6 +553,7 @@ class SDM(AnseriniSearcherMixIn, Searcher):
         ConfigOption("ow", 0.15, "ordered window weight"),
         ConfigOption("uw", 0.05, "unordered window weight"),
         ConfigOption("hits", 1000, "number of results to return"),
+        ConfigOption("topicreader", "TsvString", "which topicreader to use in anserini"),
     ]
 
     def _query_from_file(self, topicsfn, output_path, config):