Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the TREC 2024 Tip-of-the-Tongue Dataset #272

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion ir_datasets/datasets/trec_tot.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,26 @@ def default_text(self):
"""
return self.page_title + ' ' + self.text

class TipOfTheTongueDoc2024(NamedTuple):
    """A document of the TREC 2024 tip-of-the-tongue corpus.

    Fields are positional, in the order declared below.
    """
    doc_id: str
    title: str
    wikidata_id: str
    text: str
    # The 'trec-tot/2024' integration test builds this field as a list of
    # {"start": int, "end": int, "section": str} dicts, so it is a list and
    # not a str-to-str mapping.
    # NOTE(review): start/end are presumably offsets into ``text`` - confirm.
    sections: list

    def default_text(self):
        """
        We use the title and text of the document as default_text because that is everything available for users who want to respond to such an information need.
        """
        return self.title + ' ' + self.text

class TipOfTheTongueQuery2024(NamedTuple):
    """Query record with two string fields: an identifier and the query text."""
    query_id: str
    query: str

    def default_text(self):
        """Return the query string, the only textual field of this record."""
        text = self.query
        return text


class TipOfTheTongueQuery(NamedTuple):
query_id: str
Expand All @@ -44,7 +64,7 @@ def _init():
dlc = DownloadConfig.context(NAME, base_path)
subsets = {}

main_dlc = dlc['main']
main_dlc = dlc['2023']
base = Dataset(
documentation('_'),
)
Expand All @@ -55,6 +75,7 @@ def _init():
docs_2023_handler,
documentation('2023'),
)

ir_datasets.registry.register(f'{NAME}/2023', subsets['2023'])
for s in ['train', 'dev']:
subsets[f'2023/{s}'] = Dataset(
Expand All @@ -65,6 +86,23 @@ def _init():
)
ir_datasets.registry.register(f'{NAME}/2023/{s}', subsets[f'2023/{s}'])

main_dlc = dlc['2024']

docs_2024_handler = JsonlDocs(Cache(ZipExtract(main_dlc, 'corpus.jsonl'), base_path/'2024/corpus.jsonl'), doc_cls=TipOfTheTongueDoc2024, lang='en')
subsets['2024'] = Dataset(
docs_2024_handler,
documentation('2024'),
)
ir_datasets.registry.register(f'{NAME}/2024', subsets['2024'])
for s in ['test']:
subsets[f'2024/{s}'] = Dataset(
docs_2024_handler,
JsonlQueries(Cache(ZipExtract(dlc[f'2024-{s}'], f'{s}-2024/queries.jsonl'), base_path/f'2024/{s}-2024/queries.jsonl'), query_cls=TipOfTheTongueQuery2024, lang='en'),
documentation(f'2024/{s}'),
)
ir_datasets.registry.register(f'{NAME}/2024/{s}', subsets[f'2024/{s}'])


return base, subsets


Expand Down
14 changes: 14 additions & 0 deletions ir_datasets/docs/trec-tot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,17 @@ Train query set for TREC 2023 tip-of-the-tongue search track.
Dev query set for TREC 2023 tip-of-the-tongue search track.
</p>
'

2024:
desc: '
<p>
Corpus for the TREC 2024 tip-of-the-tongue search track.
</p>
'

2024/test:
desc: '
<p>
Test query set for TREC 2024 tip-of-the-tongue search track.
</p>
'
12 changes: 11 additions & 1 deletion ir_datasets/etc/downloads.json
Original file line number Diff line number Diff line change
Expand Up @@ -5943,10 +5943,20 @@
},

"trec-tot": {
"main": {
"2023": {
"url": "https://surfdrive.surf.nl/files/index.php/s/FaEK4xc6Xp2JcAJ/download",
"expected_md5": "f84fe82cb80e3ee1072576c8d6c4a417",
"cache_path": "trec-tot.zip"
},
"2024": {
"url": "https://zenodo.org/records/13370657/files/corpus.jsonl.zip?download=1",
"expected_md5": "4ea86770817e46a06fea5c94f596409c",
"cache_path": "trec-tot-2024-corpus.zip"
},
"2024-test": {
"url": "https://zenodo.org/records/13370657/files/test-2024.zip?download=1",
"expected_md5": "3d0a4d83957ee6a1398afefbc96162fa",
"cache_path": "trec-tot-2024-queries.zip"
}
},

Expand Down
1 change: 1 addition & 0 deletions ir_datasets/etc/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,7 @@
"trec-spanish/trec4": {"docs": {"_ref": "trec-spanish"}, "queries": {"count": 25}, "qrels": {"count": 13109, "fields": {"relevance": {"counts_by_value": {"1": 2202, "0": 10907}}}}},
"trec-tot": {},
"trec-tot/2023": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
"trec-tot/2024": {"docs": {"count": 3185450, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
"trec-tot/2023/dev": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
"trec-tot/2023/train": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
"tripclick": {"docs": {"count": 1523878, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
Expand Down
24 changes: 24 additions & 0 deletions test/integration/trec_tot_2024.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import re
import unittest
from ir_datasets.formats import TrecQrel
from ir_datasets.datasets.trec_tot import TipOfTheTongueDoc2024, TipOfTheTongueQuery2024
from test.integration.base import DatasetIntegrationTest
import ir_datasets


class TestTipOfTheTongue(DatasetIntegrationTest):
    # Integration checks for the 'trec-tot/2024' corpus and its 'test' query set.
    # Each test passes an expected total count and a few sample records, keyed by
    # integer, to a helper inherited from DatasetIntegrationTest.
    # NOTE(review): the integer keys are presumably positions in iteration order,
    # and compiled regexes presumably stand in for long text fields that the
    # helper pattern-matches - confirm against DatasetIntegrationTest.
    def test_tip_of_the_tongue_docs(self):
        # Samples at keys 0 and 1091; the last positional argument is the
        # `sections` value, given as a list of start/end/section dicts.
        self._test_docs('trec-tot/2024', count=3185450, items={
            0: TipOfTheTongueDoc2024("846", "Museum of Work", "Q6941060", re.compile("^The Museum of Work .*"), [{"start": 0, "end": 798, "section": "Abstract"}, {"start": 798, "end": 1620, "section": "Overview"}, {"start": 1620, "end": 3095, "section": "Exhibitions"}, {"start": 3095, "end": 3371, "section": "The history of Alva"}, {"start": 3371, "end": 3824, "section": "Industriland"}, {"start": 3824, "end": 4371, "section": "Framtidsland (Future country)"}, {"start": 4371, "end": 4761, "section": "EWK \u2014 The Center for Political Illustration Art"}]),
            1091: TipOfTheTongueDoc2024("9764", "Emma Goldman", "Q79969", re.compile("Emma Goldman \\(June 27, 1869 .*"),[{"start": 0, "end": 2752, "section": "Abstract"}, {"start": 2752, "end": 45613, "section": "Biography"}, {"start": 45613, "end": 47371, "section": "Family"}, {"start": 47371, "end": 50317, "section": "Adolescence"}, {"start": 50317, "end": 52433, "section": "Rochester, New York"}, {"start": 52433, "end": 54427, "section": "Most and Berkman"}, {"start": 54427, "end": 57448, "section": "Homestead plot"}, {"start": 57448, "end": 60672, "section": "\"Inciting to riot\""}, {"start": 60672, "end": 63288, "section": "McKinley assassination"}, {"start": 63288, "end": 66975, "section": "''Mother Earth'' and Berkman's release"}, {"start": 66975, "end": 69914, "section": "Reitman, essays, and birth control"}, {"start": 69914, "end": 73788, "section": "World War I"}, {"start": 73788, "end": 76344, "section": "Deportation"}, {"start": 76344, "end": 79375, "section": "Russia"}, {"start": 79375, "end": 83782, "section": "England, Canada, and France"}, {"start": 83782, "end": 86917, "section": "Spanish Civil War"}, {"start": 86917, "end": 87430, "section": "Final years"}, {"start": 87430, "end": 88493, "section": "Death"}, {"start": 88493, "end": 101764, "section": "Philosophy"}, {"start": 101764, "end": 106976, "section": "Anarchism"}, {"start": 106976, "end": 109922, "section": "Tactical uses of violence"}, {"start": 109922, "end": 111036, "section": "Capitalism and labor"}, {"start": 111036, "end": 114245, "section": "State"}, {"start": 114245, "end": 116281, "section": "Feminism and sexuality"}, {"start": 116281, "end": 117248, "section": "Atheism"}, {"start": 117248, "end": 120736, "section": "Legacy"}, {"start": 120736, "end": 120977, "section": "Works"}])
        })

    def test_tip_of_the_tongue_queries(self):
        # Samples at keys 0 and 599 of an expected 600 queries; query text is
        # given as an anchored regex prefix rather than the full string.
        self._test_queries('trec-tot/2024/test', count=600, items={
            0: TipOfTheTongueQuery2024("2001", re.compile("^I remember this old building I used to pass by in the heart of a bustling financial district, a place where the air always seemed thick.*")),
            599: TipOfTheTongueQuery2024("2600", re.compile("^Okay, this is a vague one .\n So I know this is going to be.*"))
        })

# Run the tests in this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()