resolved merge conflicts

embeddings-benchmark · Feb 1, 2025 · f8be95f · f8be95f
2 parents 34f2e86 + dba7a95
commit f8be95f
Show file tree

Hide file tree

Showing 49 changed files with 1,716 additions and 1,388 deletions.
diff --git a/docs/create_tasks_table.py b/docs/create_tasks_table.py
@@ -50,7 +50,9 @@ def task_to_markdown_row(task: mteb.AbsTask) -> str:
         f"[{name}]({task.metadata.reference})" if task.metadata.reference else name
     )
     domains = (
-        "[" + ", ".join(task.metadata.domains) + "]" if task.metadata.domains else ""
+        "[" + ", ".join(sorted(task.metadata.domains)) + "]"
+        if task.metadata.domains
+        else ""
     )
     n_samples = task.metadata.n_samples
     dataset_statistics = round_floats_in_dict(task.metadata.descriptive_stats)

diff --git a/docs/tasks.md b/docs/tasks.md
diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py
@@ -72,6 +72,7 @@
     "Written",
     "Programming",
     "Chemistry",
+    "Financial",
 ]
 
 SAMPLE_CREATION_METHOD = Literal[
@@ -97,7 +98,6 @@
     "Summarization",
     "InstructionRetrieval",
     "Speed",
-    "SummaryRetrieval",
 ]
 
 
@@ -173,6 +173,7 @@
         "gpl-3.0",
         "cdla-sharing-1.0",
         "mpl-2.0",
+        "msr-la-nc",
         "multiple",
     ]
 )

diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py
@@ -421,13 +421,12 @@ def load_results(
     ),
     description="A curated selection of tasks covering the Scandinavian languages; Danish, Swedish and Norwegian, including Bokmål and Nynorsk.",
     reference="https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/",
-    citation="""@misc{enevoldsen2024scandinavian,
-      title={The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding}, 
-      author={Kenneth Enevoldsen and Márton Kardos and Niklas Muennighoff and Kristoffer Laigaard Nielbo},
-      year={2024},
-      eprint={2406.02396},
-      archivePrefix={arXiv},
-      primaryClass={cs.CL}
+    citation="""@inproceedings{enevoldsen2024scandinavian,
+  title={The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding},
+  author={Enevoldsen, Kenneth and Kardos, M{\\'a}rton and Muennighoff, Niklas and Nielbo, Kristoffer},
+  booktitle={Advances in Neural Information Processing Systems},
+  year={2024},
+  url={https://nips.cc/virtual/2024/poster/97869}
 }""",
     contacts=["KennethEnevoldsen", "x-tabdeveloping", "Samoed"],
 )
@@ -1142,6 +1141,30 @@ def load_results(
 }""",
 )
 
+
+CODE_RAG = Benchmark(
+    name="CodeRAG",
+    tasks=get_tasks(
+        tasks=[
+            "CodeRAGLibraryDocumentationSolutions",
+            "CodeRAGOnlineTutorials",
+            "CodeRAGProgrammingSolutions",
+            "CodeRAGStackoverflowPosts",
+        ],
+    ),
+    description="A benchmark for evaluating code retrieval augmented generation, testing models' ability to retrieve relevant programming solutions, tutorials and documentation.",
+    reference="https://arxiv.org/abs/2406.14497",
+    citation="""@misc{wang2024coderagbenchretrievalaugmentcode,
+      title={CodeRAG-Bench: Can Retrieval Augment Code Generation?}, 
+      author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. Xu and Yiqing Xie and Graham Neubig and Daniel Fried},
+      year={2024},
+      eprint={2406.14497},
+      archivePrefix={arXiv},
+      primaryClass={cs.SE},
+      url={https://arxiv.org/abs/2406.14497}, 
+    }""",
+)
+
 NANOBEIR = Benchmark(
     name="NanoBEIR",
     tasks=get_tasks(

diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py
@@ -110,7 +110,8 @@
         # in MTEB
         "NQ": ["test"],
         "NQHardNegatives": ["test"],
-        "HotPotQA": ["test"],
+        "NQ-PL": ["test"],  # translated, not trained on
+        "HotPotQA": ["test"],
         "HotPotQAHardNegatives": ["test"],
         "HotPotQA-PL": ["test"],  # translated from hotpotQA (not trained on)
         "FEVER": ["test"],

diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py
@@ -19,7 +19,6 @@
     **E5_TRAINING_DATA,
     "FEVER": ["train"],
     "FEVERHardNegatives": ["train"],
-    "FEVER-PL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
     "HotpotQAHardNegatives": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on

diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py
@@ -130,7 +130,6 @@
     **E5_TRAINING_DATA,
     "FEVER": ["train"],
     "FEVERHardNegatives": ["train"],
-    "FEVER-PL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
     "HotpotQAHardNegatives": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on

diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py
@@ -16,7 +16,6 @@
     # also uses medi2 which contains fever and hotpotqa:
     "FEVER": ["train"],
     "FEVERHardNegatives": ["train"],
-    "FEVER-PL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
     "HotpotQAHardNegatives": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on

diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py
@@ -22,7 +22,6 @@ def instruction_template(
     "FiQA2018-PL": ["train"],
     "FEVER": ["train"],
     "FEVERHardNegatives": ["train"],
-    "FEVER-PL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
     "HotpotQAHardNegatives": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on

diff --git a/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py b/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py
@@ -22,7 +22,7 @@ class FinancialPhrasebankClassification(AbsTaskClassification):
         eval_langs=["eng-Latn"],
         main_score="accuracy",
         date=("2013-11-01", "2013-11-01"),
-        domains=["News", "Written"],
+        domains=["News", "Written", "Financial"],
         task_subtypes=["Sentiment/Hate speech"],
         license="cc-by-nc-sa-3.0",
         annotations_creators="expert-annotated",

diff --git a/mteb/tasks/Classification/kor/KorFin.py b/mteb/tasks/Classification/kor/KorFin.py
@@ -25,7 +25,7 @@ class KorFin(AbsTaskClassification):
             "2022-01-01",
             "2022-12-31",
         ),  # Assumed date based on the citations in the paper
-        domains=["News", "Written"],
+        domains=["News", "Written", "Financial"],
         task_subtypes=["Sentiment/Hate speech"],
         license="cc-by-sa-4.0",
         annotations_creators="expert-annotated",

diff --git a/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py b/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py
@@ -21,13 +21,13 @@ class ArxivClusteringS2S(AbsTaskClustering):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="v_measure",
-        date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("1991-01-01", "2021-01-01"),  # 1991-01-01 is the first arxiv paper
+        domains=["Academic", "Written"],
+        task_subtypes=[],
+        license="cc0-1.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@misc{arxiv_org_submitters_2024,
     title={arXiv Dataset},
     url={https://www.kaggle.com/dsv/7548853},

diff --git a/mteb/tasks/Clustering/eng/RedditClustering.py b/mteb/tasks/Clustering/eng/RedditClustering.py
@@ -85,14 +85,13 @@ class RedditClustering(AbsTaskClustering):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="v_measure",
-        date=None,
-        form=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("2021-01-01", "2021-04-14"),
+        domains=["Web", "Social", "Written"],
+        task_subtypes=["Thematic clustering"],
+        license="not specified",  # derived from pushshift
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@article{geigle:2021:arxiv,
         author    = {Gregor Geigle and 
                         Nils Reimers and 

diff --git a/mteb/tasks/Clustering/eng/RedditClusteringP2P.py b/mteb/tasks/Clustering/eng/RedditClusteringP2P.py
@@ -29,14 +29,13 @@ class RedditClusteringP2P(AbsTaskClustering):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="v_measure",
-        date=None,
-        form=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("2021-01-01", "2021-04-14"),
+        domains=["Web", "Social", "Written"],
+        task_subtypes=["Thematic clustering"],
+        license="not specified",  # derived from pushshift
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@article{geigle:2021:arxiv,
         author    = {Gregor Geigle and 
                         Nils Reimers and 

diff --git a/mteb/tasks/Clustering/eng/StackExchangeClustering.py b/mteb/tasks/Clustering/eng/StackExchangeClustering.py
@@ -87,14 +87,13 @@ class StackExchangeClustering(AbsTaskClustering):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="v_measure",
-        date=None,
-        form=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("2021-01-01", "2021-04-14"),
+        domains=["Web", "Written"],
+        task_subtypes=["Thematic clustering"],
+        license="not specified",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@article{geigle:2021:arxiv,
         author    = {Gregor Geigle and 
                         Nils Reimers and 

diff --git a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py
@@ -91,13 +91,13 @@ class StackExchangeClusteringP2P(AbsTaskClustering):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="v_measure",
-        date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("2021-01-01", "2021-04-14"),
+        domains=["Web", "Written"],
+        task_subtypes=["Thematic clustering"],
+        license="not specified",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@article{geigle:2021:arxiv,
         author    = {Gregor Geigle and 
                         Nils Reimers and 

diff --git a/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py b/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py
@@ -21,12 +21,12 @@ class TwitterSemEval2015PC(AbsTaskPairClassification):
         eval_langs=["eng-Latn"],
         main_score="max_ap",
         date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        domains=["Social", "Written"],
+        task_subtypes=[],
+        license="not specified",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@inproceedings{xu-etal-2015-semeval,
         title = "{S}em{E}val-2015 Task 1: Paraphrase and Semantic Similarity in {T}witter ({PIT})",
         author = "Xu, Wei  and

diff --git a/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py b/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py
@@ -21,12 +21,12 @@ class TwitterURLCorpusPC(AbsTaskPairClassification):
         eval_langs=["eng-Latn"],
         main_score="max_ap",
         date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        domains=["Social", "Written"],
+        task_subtypes=[],
+        license="not specified",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         bibtex_citation="""@inproceedings{lan-etal-2017-continuously,
             title = "A Continuously Growing Dataset of Sentential Paraphrases",
             author = "Lan, Wuwei  and

diff --git a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py
@@ -21,12 +21,12 @@ class AskUbuntuDupQuestions(AbsTaskReranking):
         eval_langs=["eng-Latn"],
         main_score="map",
         date=None,
-        domains=None,
+        domains=["Programming", "Web"],
         task_subtypes=None,
         license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
         prompt="Retrieve duplicate questions from AskUbuntu forum",
         bibtex_citation="""@article{wang-2021-TSDAE,
     title = "TSDAE: Using Transformer-based Sequential Denoising Auto-Encoderfor Unsupervised Sentence Embedding Learning",

diff --git a/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py b/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py
@@ -20,13 +20,13 @@ class StackOverflowDupQuestions(AbsTaskReranking):
         eval_splits=["test"],
         eval_langs=["eng-Latn"],
         main_score="map",
-        date=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        annotations_creators=None,
-        dialect=None,
-        sample_creation=None,
+        date=("2014-01-21", "2018-01-01"),
+        domains=["Written", "Blog", "Programming"],
+        task_subtypes=["Question answering"],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
         prompt="Retrieve duplicate questions from StackOverflow forum",
         bibtex_citation="""@article{Liu2018LinkSOAD,
   title={LinkSO: a dataset for learning to retrieve similar question answer pairs on software development forums},

diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py
@@ -5,6 +5,7 @@
 from .code.CodeEditSearchRetrieval import *
 from .code.CodeFeedbackMTRetrieval import *
 from .code.CodeFeedbackSTRetrieval import *
+from .code.CodeRAG import *
 from .code.CodeSearchNetCCRetrieval import *
 from .code.CodeSearchNetRetrieval import *
 from .code.CodeTransOceanContestRetrieval import *