diff --git a/ir_datasets/datasets/trec_tot.py b/ir_datasets/datasets/trec_tot.py index f7c8af6..8dfb545 100644 --- a/ir_datasets/datasets/trec_tot.py +++ b/ir_datasets/datasets/trec_tot.py @@ -22,6 +22,26 @@ def default_text(self): """ return self.page_title + ' ' + self.text +class TipOfTheTongueDoc2024(NamedTuple): + doc_id: str + title: str + wikidata_id: str + text: str + sections: Dict[str, str] + + def default_text(self): + """ + We use the title and text of the TipOfTheTongueDoc2024 as default_text because that is everything available for users who want to respond to such an information need. + """ + return self.title + ' ' + self.text + +class TipOfTheTongueQuery2024(NamedTuple): + query_id: str + query: str + + def default_text(self): + return self.query + class TipOfTheTongueQuery(NamedTuple): query_id: str @@ -44,7 +64,7 @@ def _init(): dlc = DownloadConfig.context(NAME, base_path) subsets = {} - main_dlc = dlc['main'] + main_dlc = dlc['2023'] base = Dataset( documentation('_'), ) @@ -55,6 +75,7 @@ def _init(): docs_2023_handler, documentation('2023'), ) + ir_datasets.registry.register(f'{NAME}/2023', subsets['2023']) for s in ['train', 'dev']: subsets[f'2023/{s}'] = Dataset( @@ -65,6 +86,23 @@ def _init(): ) ir_datasets.registry.register(f'{NAME}/2023/{s}', subsets[f'2023/{s}']) + main_dlc = dlc['2024'] + + docs_2024_handler = JsonlDocs(Cache(ZipExtract(main_dlc, 'corpus.jsonl'), base_path/'2024/corpus.jsonl'), doc_cls=TipOfTheTongueDoc2024, lang='en') + subsets['2024'] = Dataset( + docs_2024_handler, + documentation('2024'), + ) + ir_datasets.registry.register(f'{NAME}/2024', subsets['2024']) + for s in ['test']: + subsets[f'2024/{s}'] = Dataset( + docs_2024_handler, + JsonlQueries(Cache(ZipExtract(dlc[f'2024-{s}'], f'{s}-2024/queries.jsonl'), base_path/f'2024/{s}-2024/queries.jsonl'), query_cls=TipOfTheTongueQuery2024, lang='en'), + documentation(f'2024/{s}'), + ) + ir_datasets.registry.register(f'{NAME}/2024/{s}', subsets[f'2024/{s}']) + + return base, 
subsets diff --git a/ir_datasets/docs/trec-tot.yaml b/ir_datasets/docs/trec-tot.yaml index 1633c54..196fed5 100644 --- a/ir_datasets/docs/trec-tot.yaml +++ b/ir_datasets/docs/trec-tot.yaml @@ -26,3 +26,17 @@ Train query set for TREC 2023 tip-of-the-tongue search track. Dev query set for TREC 2023 tip-of-the-tongue search track.
' + +2024: + desc: ' +<p> +Corpus for the TREC 2024 tip-of-the-tongue search track. +</p>
+' + +2024/test: + desc: ' +<p> +Test query set for TREC 2024 tip-of-the-tongue search track. +</p>
+' diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json index 397646a..3e59d1d 100644 --- a/ir_datasets/etc/downloads.json +++ b/ir_datasets/etc/downloads.json @@ -5943,10 +5943,20 @@ }, "trec-tot": { - "main": { + "2023": { "url": "https://surfdrive.surf.nl/files/index.php/s/FaEK4xc6Xp2JcAJ/download", "expected_md5": "f84fe82cb80e3ee1072576c8d6c4a417", "cache_path": "trec-tot.zip" + }, + "2024": { + "url": "https://zenodo.org/records/13370657/files/corpus.jsonl.zip?download=1", + "expected_md5": "4ea86770817e46a06fea5c94f596409c", + "cache_path": "trec-tot-2024-corpus.zip" + }, + "2024-test": { + "url": "https://zenodo.org/records/13370657/files/test-2024.zip?download=1", + "expected_md5": "3d0a4d83957ee6a1398afefbc96162fa", + "cache_path": "trec-tot-2024-queries.zip" } }, diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json index 60bd7c8..c291aed 100644 --- a/ir_datasets/etc/metadata.json +++ b/ir_datasets/etc/metadata.json @@ -659,6 +659,7 @@ "trec-spanish/trec4": {"docs": {"_ref": "trec-spanish"}, "queries": {"count": 25}, "qrels": {"count": 13109, "fields": {"relevance": {"counts_by_value": {"1": 2202, "0": 10907}}}}}, "trec-tot": {}, "trec-tot/2023": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, + "trec-tot/2024": {"docs": {"count": 3185450, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-tot/2023/dev": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}}, "trec-tot/2023/train": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}}, "tripclick": {"docs": {"count": 1523878, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, diff --git 
a/test/integration/trec_tot_2024.py b/test/integration/trec_tot_2024.py new file mode 100644 index 0000000..5a4a43c --- /dev/null +++ b/test/integration/trec_tot_2024.py @@ -0,0 +1,24 @@ +import re +import unittest +from ir_datasets.formats import TrecQrel +from ir_datasets.datasets.trec_tot import TipOfTheTongueDoc2024, TipOfTheTongueQuery2024 +from test.integration.base import DatasetIntegrationTest +import ir_datasets + + +class TestTipOfTheTongue(DatasetIntegrationTest): + def test_tip_of_the_tongue_docs(self): + self._test_docs('trec-tot/2024', count=3185450, items={ + 0: TipOfTheTongueDoc2024("846", "Museum of Work", "Q6941060", re.compile("^The Museum of Work .*"), [{"start": 0, "end": 798, "section": "Abstract"}, {"start": 798, "end": 1620, "section": "Overview"}, {"start": 1620, "end": 3095, "section": "Exhibitions"}, {"start": 3095, "end": 3371, "section": "The history of Alva"}, {"start": 3371, "end": 3824, "section": "Industriland"}, {"start": 3824, "end": 4371, "section": "Framtidsland (Future country)"}, {"start": 4371, "end": 4761, "section": "EWK \u2014 The Center for Political Illustration Art"}]), + 1091: TipOfTheTongueDoc2024("9764", "Emma Goldman", "Q79969", re.compile("Emma Goldman \\(June 27, 1869 .*"),[{"start": 0, "end": 2752, "section": "Abstract"}, {"start": 2752, "end": 45613, "section": "Biography"}, {"start": 45613, "end": 47371, "section": "Family"}, {"start": 47371, "end": 50317, "section": "Adolescence"}, {"start": 50317, "end": 52433, "section": "Rochester, New York"}, {"start": 52433, "end": 54427, "section": "Most and Berkman"}, {"start": 54427, "end": 57448, "section": "Homestead plot"}, {"start": 57448, "end": 60672, "section": "\"Inciting to riot\""}, {"start": 60672, "end": 63288, "section": "McKinley assassination"}, {"start": 63288, "end": 66975, "section": "''Mother Earth'' and Berkman's release"}, {"start": 66975, "end": 69914, "section": "Reitman, essays, and birth control"}, {"start": 69914, "end": 73788, "section": 
"World War I"}, {"start": 73788, "end": 76344, "section": "Deportation"}, {"start": 76344, "end": 79375, "section": "Russia"}, {"start": 79375, "end": 83782, "section": "England, Canada, and France"}, {"start": 83782, "end": 86917, "section": "Spanish Civil War"}, {"start": 86917, "end": 87430, "section": "Final years"}, {"start": 87430, "end": 88493, "section": "Death"}, {"start": 88493, "end": 101764, "section": "Philosophy"}, {"start": 101764, "end": 106976, "section": "Anarchism"}, {"start": 106976, "end": 109922, "section": "Tactical uses of violence"}, {"start": 109922, "end": 111036, "section": "Capitalism and labor"}, {"start": 111036, "end": 114245, "section": "State"}, {"start": 114245, "end": 116281, "section": "Feminism and sexuality"}, {"start": 116281, "end": 117248, "section": "Atheism"}, {"start": 117248, "end": 120736, "section": "Legacy"}, {"start": 120736, "end": 120977, "section": "Works"}]) + }) + + def test_tip_of_the_tongue_queries(self): + self._test_queries('trec-tot/2024/test', count=600, items={ + 0: TipOfTheTongueQuery2024("2001", re.compile("^I remember this old building I used to pass by in the heart of a bustling financial district, a place where the air always seemed thick.*")), + 599: TipOfTheTongueQuery2024("2600", re.compile("^Okay, this is a vague one .\n So I know this is going to be.*")) + }) + +if __name__ == '__main__': + unittest.main() +