Skip to content

Commit

Permalink
resolved merge conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
mehrzadshm committed Feb 1, 2025
2 parents 34f2e86 + dba7a95 commit f8be95f
Show file tree
Hide file tree
Showing 49 changed files with 1,716 additions and 1,388 deletions.
4 changes: 3 additions & 1 deletion docs/create_tasks_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@ def task_to_markdown_row(task: mteb.AbsTask) -> str:
f"[{name}]({task.metadata.reference})" if task.metadata.reference else name
)
domains = (
"[" + ", ".join(task.metadata.domains) + "]" if task.metadata.domains else ""
"[" + ", ".join(sorted(task.metadata.domains)) + "]"
if task.metadata.domains
else ""
)
n_samples = task.metadata.n_samples
dataset_statistics = round_floats_in_dict(task.metadata.descriptive_stats)
Expand Down
2,316 changes: 1,159 additions & 1,157 deletions docs/tasks.md

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion mteb/abstasks/TaskMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
"Written",
"Programming",
"Chemistry",
"Financial",
]

SAMPLE_CREATION_METHOD = Literal[
Expand All @@ -97,7 +98,6 @@
"Summarization",
"InstructionRetrieval",
"Speed",
"SummaryRetrieval",
]


Expand Down Expand Up @@ -173,6 +173,7 @@
"gpl-3.0",
"cdla-sharing-1.0",
"mpl-2.0",
"msr-la-nc",
"multiple",
]
)
Expand Down
37 changes: 30 additions & 7 deletions mteb/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,13 +421,12 @@ def load_results(
),
description="A curated selection of tasks covering the Scandinavian languages; Danish, Swedish and Norwegian, including Bokmål and Nynorsk.",
reference="https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/",
citation="""@misc{enevoldsen2024scandinavian,
title={The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding},
author={Kenneth Enevoldsen and Márton Kardos and Niklas Muennighoff and Kristoffer Laigaard Nielbo},
year={2024},
eprint={2406.02396},
archivePrefix={arXiv},
primaryClass={cs.CL}
citation="""@inproceedings{enevoldsen2024scandinavian,
title={The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding},
author={Enevoldsen, Kenneth and Kardos, M{\'a}rton and Muennighoff, Niklas and Nielbo, Kristoffer},
booktitle={Advances in Neural Information Processing Systems},
year={2024},
url={https://nips.cc/virtual/2024/poster/97869}
}""",
contacts=["KennethEnevoldsen", "x-tabdeveloping", "Samoed"],
)
Expand Down Expand Up @@ -1142,6 +1141,30 @@ def load_results(
}""",
)


# CodeRAG benchmark definition: groups the four CodeRAG retrieval tasks
# (library documentation, online tutorials, programming solutions and
# StackOverflow posts) under the single benchmark name "CodeRAG".
# The reference URL and the BibTeX citation both point to the CodeRAG-Bench
# paper (arXiv 2406.14497).
CODE_RAG = Benchmark(
name="CodeRAG",
# Tasks are looked up by name through get_tasks.
tasks=get_tasks(
tasks=[
"CodeRAGLibraryDocumentationSolutions",
"CodeRAGOnlineTutorials",
"CodeRAGProgrammingSolutions",
"CodeRAGStackoverflowPosts",
],
),
description="A benchmark for evaluating code retrieval augmented generation, testing models' ability to retrieve relevant programming solutions, tutorials and documentation.",
reference="https://arxiv.org/abs/2406.14497",
citation="""@misc{wang2024coderagbenchretrievalaugmentcode,
title={CodeRAG-Bench: Can Retrieval Augment Code Generation?},
author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. Xu and Yiqing Xie and Graham Neubig and Daniel Fried},
year={2024},
eprint={2406.14497},
archivePrefix={arXiv},
primaryClass={cs.SE},
url={https://arxiv.org/abs/2406.14497},
}""",
)

NANOBEIR = Benchmark(
name="NanoBEIR",
tasks=get_tasks(
Expand Down
3 changes: 2 additions & 1 deletion mteb/models/arctic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@
# in MTEB
"NQ": ["test"],
"NQHardNegatives": ["test"],
"HotPotQA": ["test"],
"NQ-PL": ["test"],
"HotPotQA": ["test"], # translated, not trained on
"HotPotQAHardNegatives": ["test"],
"HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on)
"FEVER": ["test"],
Expand Down
1 change: 0 additions & 1 deletion mteb/models/e5_instruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
**E5_TRAINING_DATA,
"FEVER": ["train"],
"FEVERHardNegatives": ["train"],
"FEVER-PL": ["train"], # translation not trained on
"HotpotQA": ["train"],
"HotpotQAHardNegatives": ["train"],
"HotpotQA-PL": ["train"], # translation not trained on
Expand Down
1 change: 0 additions & 1 deletion mteb/models/e5_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@
**E5_TRAINING_DATA,
"FEVER": ["train"],
"FEVERHardNegatives": ["train"],
"FEVER-PL": ["train"], # translation not trained on
"HotpotQA": ["train"],
"HotpotQAHardNegatives": ["train"],
"HotpotQA-PL": ["train"], # translation not trained on
Expand Down
1 change: 0 additions & 1 deletion mteb/models/gritlm_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
# also uses medi2 which contains fever and hotpotqa:
"FEVER": ["train"],
"FEVERHardNegatives": ["train"],
"FEVER-PL": ["train"], # translation not trained on
"HotpotQA": ["train"],
"HotpotQAHardNegatives": ["train"],
"HotpotQA-PL": ["train"], # translation not trained on
Expand Down
1 change: 0 additions & 1 deletion mteb/models/salesforce_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ def instruction_template(
"FiQA2018-PL": ["train"],
"FEVER": ["train"],
"FEVERHardNegatives": ["train"],
"FEVER-PL": ["train"], # translation not trained on
"HotpotQA": ["train"],
"HotpotQAHardNegatives": ["train"],
"HotpotQA-PL": ["train"], # translation not trained on
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class FinancialPhrasebankClassification(AbsTaskClassification):
eval_langs=["eng-Latn"],
main_score="accuracy",
date=("2013-11-01", "2013-11-01"),
domains=["News", "Written"],
domains=["News", "Written", "Financial"],
task_subtypes=["Sentiment/Hate speech"],
license="cc-by-nc-sa-3.0",
annotations_creators="expert-annotated",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/Classification/kor/KorFin.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class KorFin(AbsTaskClassification):
"2022-01-01",
"2022-12-31",
), # Assumed date based on the citations in the paper
domains=["News", "Written"],
domains=["News", "Written", "Financial"],
task_subtypes=["Sentiment/Hate speech"],
license="cc-by-sa-4.0",
annotations_creators="expert-annotated",
Expand Down
14 changes: 7 additions & 7 deletions mteb/tasks/Clustering/eng/ArxivClusteringS2S.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ class ArxivClusteringS2S(AbsTaskClustering):
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="v_measure",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
date=("1991-01-01", "2021-01-01"), # 1991-01-01 is the first arxiv paper
domains=["Academic", "Written"],
task_subtypes=[],
license="cc0-1.0",
annotations_creators="derived",
dialect=[],
sample_creation="found",
bibtex_citation="""@misc{arxiv_org_submitters_2024,
title={arXiv Dataset},
url={https://www.kaggle.com/dsv/7548853},
Expand Down
15 changes: 7 additions & 8 deletions mteb/tasks/Clustering/eng/RedditClustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,13 @@ class RedditClustering(AbsTaskClustering):
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="v_measure",
date=None,
form=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
date=("2021-01-01", "2021-04-14"),
domains=["Web", "Social", "Written"],
task_subtypes=["Thematic clustering"],
license="not specified", # derived from pushshift
annotations_creators="derived",
dialect=[],
sample_creation="found",
bibtex_citation="""@article{geigle:2021:arxiv,
author = {Gregor Geigle and
Nils Reimers and
Expand Down
15 changes: 7 additions & 8 deletions mteb/tasks/Clustering/eng/RedditClusteringP2P.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,13 @@ class RedditClusteringP2P(AbsTaskClustering):
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="v_measure",
date=None,
form=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
date=("2021-01-01", "2021-04-14"),
domains=["Web", "Social", "Written"],
task_subtypes=["Thematic clustering"],
license="not specified", # derived from pushshift
annotations_creators="derived",
dialect=[],
sample_creation="found",
bibtex_citation="""@article{geigle:2021:arxiv,
author = {Gregor Geigle and
Nils Reimers and
Expand Down
15 changes: 7 additions & 8 deletions mteb/tasks/Clustering/eng/StackExchangeClustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,13 @@ class StackExchangeClustering(AbsTaskClustering):
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="v_measure",
date=None,
form=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
date=("2021-01-01", "2021-04-14"),
domains=["Web", "Written"],
task_subtypes=["Thematic clustering"],
license="not specified",
annotations_creators="derived",
dialect=[],
sample_creation="found",
bibtex_citation="""@article{geigle:2021:arxiv,
author = {Gregor Geigle and
Nils Reimers and
Expand Down
14 changes: 7 additions & 7 deletions mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,13 @@ class StackExchangeClusteringP2P(AbsTaskClustering):
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="v_measure",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
date=("2021-01-01", "2021-04-14"),
domains=["Web", "Written"],
task_subtypes=["Thematic clustering"],
license="not specified",
annotations_creators="derived",
dialect=[],
sample_creation="found",
bibtex_citation="""@article{geigle:2021:arxiv,
author = {Gregor Geigle and
Nils Reimers and
Expand Down
12 changes: 6 additions & 6 deletions mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ class TwitterSemEval2015PC(AbsTaskPairClassification):
eval_langs=["eng-Latn"],
main_score="max_ap",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
domains=["Social", "Written"],
task_subtypes=[],
license="not specified",
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
bibtex_citation="""@inproceedings{xu-etal-2015-semeval,
title = "{S}em{E}val-2015 Task 1: Paraphrase and Semantic Similarity in {T}witter ({PIT})",
author = "Xu, Wei and
Expand Down
12 changes: 6 additions & 6 deletions mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ class TwitterURLCorpusPC(AbsTaskPairClassification):
eval_langs=["eng-Latn"],
main_score="max_ap",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
domains=["Social", "Written"],
task_subtypes=[],
license="not specified",
annotations_creators="derived",
dialect=[],
sample_creation="found",
bibtex_citation="""@inproceedings{lan-etal-2017-continuously,
title = "A Continuously Growing Dataset of Sentential Paraphrases",
author = "Lan, Wuwei and
Expand Down
8 changes: 4 additions & 4 deletions mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ class AskUbuntuDupQuestions(AbsTaskReranking):
eval_langs=["eng-Latn"],
main_score="map",
date=None,
domains=None,
domains=["Programming", "Web"],
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
prompt="Retrieve duplicate questions from AskUbuntu forum",
bibtex_citation="""@article{wang-2021-TSDAE,
title = "TSDAE: Using Transformer-based Sequential Denoising Auto-Encoderfor Unsupervised Sentence Embedding Learning",
Expand Down
14 changes: 7 additions & 7 deletions mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ class StackOverflowDupQuestions(AbsTaskReranking):
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="map",
date=None,
domains=None,
task_subtypes=None,
license=None,
annotations_creators=None,
dialect=None,
sample_creation=None,
date=("2014-01-21", "2018-01-01"),
domains=["Written", "Blog", "Programming"],
task_subtypes=["Question answering"],
license="cc-by-nc-sa-4.0",
annotations_creators="derived",
dialect=[],
sample_creation="found",
prompt="Retrieve duplicate questions from StackOverflow forum",
bibtex_citation="""@article{Liu2018LinkSOAD,
title={LinkSO: a dataset for learning to retrieve similar question answer pairs on software development forums},
Expand Down
1 change: 1 addition & 0 deletions mteb/tasks/Retrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .code.CodeEditSearchRetrieval import *
from .code.CodeFeedbackMTRetrieval import *
from .code.CodeFeedbackSTRetrieval import *
from .code.CodeRAG import *
from .code.CodeSearchNetCCRetrieval import *
from .code.CodeSearchNetRetrieval import *
from .code.CodeTransOceanContestRetrieval import *
Expand Down
Loading

0 comments on commit f8be95f

Please sign in to comment.