From e2a27a72088187459a33a58213cd6514d8fbcc19 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 11 Nov 2024 12:02:47 +0100 Subject: [PATCH 01/43] Add swiss legal evals as new community tasks --- community_tasks/swiss_legal_evals.py | 402 +++++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 community_tasks/swiss_legal_evals.py diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py new file mode 100644 index 000000000..5b03b8406 --- /dev/null +++ b/community_tasks/swiss_legal_evals.py @@ -0,0 +1,402 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# ruff: noqa: F405, F403, F401 +""" +This module contains task configurations and prompt functions for evaluating +LLM models on Swiss legal datasets. Each task is defined using the +`LightevalTaskConfig` class with its respective prompt function. The tasks +cover a variety of benchmarks, including: translation of laws, court decisions +and press releases. 
+ +Author: Joel Niklaus +""" +from lighteval.logging.hierarchical_logger import hlog_warn +import nltk + +nltk.download("punkt_tab") + + +import numpy as np +import re +from dataclasses import dataclass + + +from transformers import AutoTokenizer, AutoModelForSequenceClassification +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.metrics_sample import JudgeLLMMixEval, BertScore +from lighteval.metrics.utils.metric_utils import ( + MetricCategory, + MetricUseCase, + SampleLevelMetric, + SampleLevelMetricGrouping, +) +from lighteval.metrics.imports.bert_scorer import BERTScorer +from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip +from lighteval.tasks.extended.mix_eval.judge_prompts import ( + flow_judge_for_freeform_template, + gpt_judge_for_closeended_freeform, +) + + +# CUSTOM METRICS +def process_judge_response(x): + search = re.search(r"\s(\d)\s", x) + return int(search.group(1)) if search else 0 + + +def process_judge_response_freeform_gpt(x): + search = re.search(r"\[\[(\d.\d)\]\]", x) + answer = float(search.group(1) if search else 0) + return answer + + +def freeform_flow_judge(): + return SampleLevelMetricGrouping( + metric_name=["llm_judge_mixeval_flow"], + higher_is_better={"judge_score_flow": True}, + category=MetricCategory.LLM_AS_JUDGE, + use_case=MetricUseCase.SUMMARIZATION, + sample_level_fn=JudgeLLMMixEval( + judge_model_name="flowaicom/Flow-Judge-v0.1", + template=flow_judge_for_freeform_template, + process_judge_response=process_judge_response, + judge_backend="vllm", + short_judge_name="flow", + ).compute, + corpus_level_fn={ + "judge_score_flow": np.mean, + }, + ) + + +def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): + return SampleLevelMetricGrouping( + metric_name=[f"llm_judge_mixeval_{judge_model_name}"], + higher_is_better={"judge_score_{judge_model_name}": True}, + category=MetricCategory.LLM_AS_JUDGE, + use_case=MetricUseCase.SUMMARIZATION, + sample_level_fn=JudgeLLMMixEval( + judge_model_name=judge_model_name, + template=gpt_judge_for_closeended_freeform, + process_judge_response=process_judge_response_freeform_gpt, + judge_backend="openai", + short_judge_name=judge_model_name, + ).compute, + corpus_level_fn={ + f"judge_score_{judge_model_name}": np.mean, + }, + ) + + +def bert_score(model_type: str = "xlm-roberta-large"): + score = BertScore( + normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip + ) + score.bert_scorer = BERTScorer( + # We could download the files from here and set the baseline_path ourselves: + # https://github.com/Tiiiger/bert_score/tree/master/bert_score/rescale_baseline + model_type=model_type, + lang=None, # Needs to be set if rescale_with_baseline is True + rescale_with_baseline=False, + baseline_path=None, + ) + return SampleLevelMetricGrouping( + metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], + sample_level_fn=score.compute, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.SUMMARIZATION, + corpus_level_fn={ + "BERTScore-P": np.mean, + "BERTScore-R": np.mean, + "BERTScore-F": np.mean, + }, + higher_is_better={ + "BERTScore-P": True, + "BERTScore-R": True, + "BERTScore-F": True, + }, + ) + + +class BLEURT: + def __init__(self, model_size: str = "tiny", seq_len: int = 512): + """Creates a BLEURT scorer based on the model size (tiny, base, large) and sequence length (128, 512).""" + assert model_size in [ + "tiny", + "base", + 
"large", + ], "Model size must be either tiny, base, or large" + assert seq_len in [128, 512], "Sequence length must be either 128 or 512" + + self.tokenizer = AutoTokenizer.from_pretrained( + f"Elron/bleurt-{model_size}-{seq_len}" + ) + self.model = AutoModelForSequenceClassification.from_pretrained( + f"Elron/bleurt-{model_size}-{seq_len}" + ) + self.max_length = seq_len + self.model.eval() + + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: + """Uses the stored BLEURT scorer to compute the score on the current sample. + + Args: + golds (list[str]): Reference targets + predictions (list[str]): Predicted strings + + Returns: + float: Score over the current sample's items. + """ + if len(predictions) == 1: + predictions = predictions * len(golds) + inputs = self.tokenizer( + golds, + predictions, + return_tensors="pt", + padding=True, + truncation=True, + max_length=self.max_length, + ) + if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): + hlog_warn( + f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring" + ) + scores = self.model(**inputs)[0].squeeze() + return scores.item() + + +def bleurt(model_size: str = "tiny", seq_len: int = 512): + return SampleLevelMetric( + metric_name="bleurt", + sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len).compute, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + + +# EVALS WITH SUBSET +# This is how you create a subset task (like MMLU), which has several subset +# each being its own evaluation task. + + +def create_translation_pairs(langs_list: list) -> list[tuple]: + """ + Create all possible translation pairs from a given list of languages. + + Args: + langs_list (list): A list of languages. + + Returns: + lang_pairs_list (list): A list of tuples representing a translation pair. 
+ """ + lang_pairs_list = [] + for i, lang1 in enumerate(langs_list): + for lang2 in langs_list[i + 1 :]: + lang_pairs_list.append((lang1, lang2)) + lang_pairs_list.append((lang2, lang1)) + return lang_pairs_list + + +@dataclass +class LevelConfig: + name: str + text_col_name: str + prompt_prefix: str + metadata_cols: list[str] + + +@dataclass +class DatasetConfig: + name: str + hf_repo: str + languages: list[str] + subsets: dict[str, LevelConfig] + + def __post_init__(self): + self.translation_pairs = create_translation_pairs(self.languages) + + +SwissDecisionSummaryTranslations = DatasetConfig( + name="sdst", + hf_repo="joelniklaus/SwissDecisionSummaryTranslations", + languages=["de", "fr", "it"], + subsets={ + "bge_level": LevelConfig( + name="bge_level", + text_col_name="bgeText", + prompt_prefix="Consider the following summary of a Swiss leading court decision", + metadata_cols=["bge"], + ), + "regeste_level": LevelConfig( + name="regeste_level", + text_col_name="regesteText", + prompt_prefix="Consider the following paragraph of a summary of a Swiss leading court decision", + metadata_cols=["bge"], + ), + "text_level": LevelConfig( + name="text_level", + text_col_name="text", + prompt_prefix="Consider the following sentence of a summary of a Swiss leading court decision", + metadata_cols=["bge"], + ), + }, +) + + +SwissLawTranslations = DatasetConfig( + name="slt", + hf_repo="joelniklaus/SwissLawTranslations", + languages=["de", "fr", "it", "rm", "en"], + subsets={ + "law_level": LevelConfig( + name="law_level", + text_col_name="lawText", + prompt_prefix="Consider the following Swiss federal law", + metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + ), + "article_level": LevelConfig( + name="article_level", + text_col_name="articleText", + prompt_prefix="Consider the following Swiss federal law article", + metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + ), + "paragraph_level": LevelConfig( + name="paragraph_level", + text_col_name="paragraphText", + prompt_prefix="Consider the following Swiss federal law paragraph", + metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + ), + }, +) + +SwissSupremeCourtPressReleaseTranslations = DatasetConfig( + name="sscprt", + hf_repo="joelniklaus/SwissSupremeCourtPressReleaseTranslations", + languages=["de", "fr", "it"], + subsets={ + "press_release": LevelConfig( + name="press_release", + text_col_name="text", + prompt_prefix="Consider the following Swiss Supreme Court press release", + metadata_cols=["filename"], + ) + }, +) + + +def create_prompt_fn(level_config: LevelConfig, src_lang: str, target_lang: str): + """ + Create a prompt function for a given level configuration. 
+ """ + text_col = level_config.text_col_name + src_text_col = f"{src_lang}_{text_col}" + target_text_col = f"{target_lang}_{text_col}" + + def prompt_fn(line: dict, task_name: str = None): + custom_query = f"{level_config.prompt_prefix}: {line[src_text_col]}\nTranslate from {src_lang} to {target_lang}.\nTranslation: " + + return Doc( + task_name=task_name, + query=custom_query, + choices=[str(line[target_text_col])], + gold_index=0, + specific={ + **{col: line[col] for col in level_config.metadata_cols}, + "question": custom_query, + }, + ) + + return prompt_fn + + +class TranslationTask(LightevalTaskConfig): + def __init__( + self, + dataset_config: DatasetConfig, + level_name: str, + src_lang: str, + target_lang: str, + ): + super().__init__( + name=f"{dataset_config.name}-{level_name}:{src_lang}-{target_lang}", + suite=["community"], + prompt_function=create_prompt_fn( + dataset_config.subsets[level_name], src_lang, target_lang + ), + hf_repo=dataset_config.hf_repo, + hf_subset=level_name, + hf_filter=None, + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], # ["validation", "test"], + few_shots_split="validation", + few_shots_select=None, + generation_size=10, + metric=[ + Metrics.bleu, + Metrics.bleu_1, + Metrics.bleu_4, + Metrics.chrf, + Metrics.ter, + bert_score(model_type="xlm-roberta-large"), + bleurt(model_size="tiny", seq_len=512), + freeform_gpt_judge(judge_model_name="gpt-4o"), + # freeform_flow_judge(), # TODO: Needs to be tested on GPU machine + # TODO: add prometheus eval + ], + stop_sequence=["\n"], + trust_dataset=True, + ) + + +# STORE YOUR EVALS + +# list of all the subsets to use for this eval +DATASETS = [ + SwissDecisionSummaryTranslations, + SwissLawTranslations, + SwissSupremeCourtPressReleaseTranslations, +] + +TASKS_TABLE = [ + TranslationTask( + dataset_config=dataset, + level_name=subset, + src_lang=src_lang, + target_lang=target_lang, + ) + for dataset in DATASETS + for subset in dataset.subsets + for src_lang, target_lang in dataset.translation_pairs +] + + +# MODULE LOGIC +# You should not need to touch this +# Convert to dict for lighteval +if __name__ == "__main__": + print(t.name for t in TASKS_TABLE) + print(len(TASKS_TABLE)) From aa409c831f7a882619ba89b55e76c254bdbea0c2 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 12 Nov 2024 10:34:14 +0100 Subject: [PATCH 02/43] Removed nltk and numpy dependencies. 
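Note: the numpy dependency was only used for corpus-level averaging, so statistics.mean from the standard library is a drop-in replacement here. A minimal sketch of the aggregation pattern these metrics rely on (the score values are made up):

```python
import statistics

# Corpus-level aggregation in these metrics receives a flat list of
# per-sample floats, so the standard-library mean is sufficient.
sample_scores = [0.72, 0.85, 0.64]  # illustrative per-sample scores
print(statistics.mean(sample_scores))  # ~0.7367
```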
--- community_tasks/swiss_legal_evals.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 5b03b8406..9b0d8d45a 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -31,12 +31,8 @@ Author: Joel Niklaus """ from lighteval.logging.hierarchical_logger import hlog_warn -import nltk -nltk.download("punkt_tab") - - -import numpy as np +import statistics import re from dataclasses import dataclass @@ -86,7 +82,7 @@ def freeform_flow_judge(): short_judge_name="flow", ).compute, corpus_level_fn={ - "judge_score_flow": np.mean, + "judge_score_flow": statistics.mean, }, ) @@ -105,7 +101,7 @@ def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): short_judge_name=judge_model_name, ).compute, corpus_level_fn={ - f"judge_score_{judge_model_name}": np.mean, + f"judge_score_{judge_model_name}": statistics.mean, }, ) @@ -128,9 +124,9 @@ def bert_score(model_type: str = "xlm-roberta-large"): category=MetricCategory.GENERATIVE, use_case=MetricUseCase.SUMMARIZATION, corpus_level_fn={ - "BERTScore-P": np.mean, - "BERTScore-R": np.mean, - "BERTScore-F": np.mean, + "BERTScore-P": statistics.mean, + "BERTScore-R": statistics.mean, + "BERTScore-F": statistics.mean, }, higher_is_better={ "BERTScore-P": True, @@ -193,7 +189,7 @@ def bleurt(model_size: str = "tiny", seq_len: int = 512): sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len).compute, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, - corpus_level_fn=np.mean, + corpus_level_fn=statistics.mean, higher_is_better=True, ) From a8ee2a5cd37e82fdd8d5dfb12b46ffb7a2a3f0c5 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 12 Nov 2024 10:37:11 +0100 Subject: [PATCH 03/43] Added short dataset descriptions. --- community_tasks/swiss_legal_evals.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 9b0d8d45a..549391ce2 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -236,6 +236,7 @@ def __post_init__(self): self.translation_pairs = create_translation_pairs(self.languages) +# Translation of Swiss Federal Supreme Court Decision Summaries on three levels: the entire decision, the regeste level and the text level. SwissDecisionSummaryTranslations = DatasetConfig( name="sdst", hf_repo="joelniklaus/SwissDecisionSummaryTranslations", @@ -262,7 +263,7 @@ def __post_init__(self): }, ) - +# Translation of Swiss Federal Laws on three levels: the entire law, the article level and the paragraph level. SwissLawTranslations = DatasetConfig( name="slt", hf_repo="joelniklaus/SwissLawTranslations", @@ -289,6 +290,7 @@ def __post_init__(self): }, ) +# Translation of Swiss Federal Supreme Court Press Releases on one level: the entire press release. SwissSupremeCourtPressReleaseTranslations = DatasetConfig( name="sscprt", hf_repo="joelniklaus/SwissSupremeCourtPressReleaseTranslations", From c7f703807acef2edf234e999f817bd9f7c07c418 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 13 Nov 2024 16:00:32 +0100 Subject: [PATCH 04/43] Removed open judge models and added COMET and METEOR. 
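Note: COMET scores each segment against both the source and the reference, which is why the prompt function in this patch starts storing the source text under specific["source"]. A standalone sketch of the call pattern, mirroring the COMET wrapper added below (the example strings are invented):

```python
from comet import download_model, load_from_checkpoint

# One dict per segment: source ("src"), model translation ("mt"), reference ("ref").
model = load_from_checkpoint(download_model("Unbabel/wmt22-comet-da"))
data = [
    {
        "src": "Der Vertrag ist nichtig.",  # source segment (invented example)
        "mt": "The contract is null.",      # model translation
        "ref": "A contract is void.",       # golden translation
    }
]
output = model.predict(data, batch_size=1, gpus=1, accelerator="cpu")
print(output["scores"])        # per-segment scores
print(output["system_score"])  # corpus-level score returned by the wrapper
```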
--- community_tasks/swiss_legal_evals.py | 168 +++++++++++++++++++-------- 1 file changed, 122 insertions(+), 46 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 549391ce2..54b01c4fb 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -30,14 +30,20 @@ Author: Joel Niklaus """ -from lighteval.logging.hierarchical_logger import hlog_warn import statistics -import re from dataclasses import dataclass +from packaging import version +import importlib.metadata as importlib_metadata + +import nltk +from nltk import word_tokenize +from nltk.translate import meteor_score +from comet import download_model, load_from_checkpoint from transformers import AutoTokenizer, AutoModelForSequenceClassification +from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc from lighteval.metrics.metrics import Metrics @@ -50,46 +56,19 @@ ) from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip +from lighteval.tasks.extended.mix_eval.main import process_judge_response_freeform_gpt from lighteval.tasks.extended.mix_eval.judge_prompts import ( - flow_judge_for_freeform_template, gpt_judge_for_closeended_freeform, ) # CUSTOM METRICS -def process_judge_response(x): - search = re.search(r"\s(\d)\s", x) - return int(search.group(1)) if search else 0 - - -def process_judge_response_freeform_gpt(x): - search = re.search(r"\[\[(\d.\d)\]\]", x) - answer = float(search.group(1) if search else 0) - return answer - - -def freeform_flow_judge(): - return SampleLevelMetricGrouping( - metric_name=["llm_judge_mixeval_flow"], - higher_is_better={"judge_score_flow": True}, - category=MetricCategory.LLM_AS_JUDGE, - use_case=MetricUseCase.SUMMARIZATION, - sample_level_fn=JudgeLLMMixEval( - judge_model_name="flowaicom/Flow-Judge-v0.1", - template=flow_judge_for_freeform_template, - process_judge_response=process_judge_response, - judge_backend="vllm", - short_judge_name="flow", - ).compute, - corpus_level_fn={ - "judge_score_flow": statistics.mean, - }, - ) +# TODO: adjust prompt def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): return SampleLevelMetricGrouping( - metric_name=[f"llm_judge_mixeval_{judge_model_name}"], + metric_name=[f"llm_judge_{judge_model_name}"], higher_is_better={"judge_score_{judge_model_name}": True}, category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.SUMMARIZATION, @@ -106,7 +85,7 @@ def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): ) -def bert_score(model_type: str = "xlm-roberta-large"): +def get_bert_score(model_type: str = "xlm-roberta-large"): score = BertScore( normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip ) @@ -136,6 +115,9 @@ def bert_score(model_type: str = "xlm-roberta-large"): ) +bert_score = get_bert_score(model_type="xlm-roberta-large") + + class BLEURT: def __init__(self, model_size: str = "tiny", seq_len: int = 512): """Creates a BLEURT scorer based on the model size (tiny, base, large) and sequence length (128, 512).""" @@ -183,16 +165,105 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return scores.item() -def bleurt(model_size: str = "tiny", seq_len: int = 512): - return SampleLevelMetric( - metric_name="bleurt", - sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len).compute, - 
category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - corpus_level_fn=statistics.mean, - higher_is_better=True, - ) +bleurt = SampleLevelMetric( + metric_name="bleurt", + sample_level_fn=BLEURT(model_size="tiny", seq_len=512).compute, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=statistics.mean, + higher_is_better=True, +) + + +class COMET: + def __init__( + self, + model_name: str = "Unbabel/wmt22-comet-da", + batch_size: int = 1, + gpus: int = 1, + accelerator: str = "cpu", # "mps" leads to errors + ): + model_path = download_model(model_name) + self.model = load_from_checkpoint(model_path) + self.batch_size = batch_size + self.gpus = gpus + self.accelerator = accelerator + + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: + data = [ + {"src": src, "mt": pred, "ref": gold} + for src, pred, gold in zip( + [kwargs["formatted_doc"].specific["source"]] * len(predictions), + predictions, + golds, + ) + ] + model_output = self.model.predict( + data, + batch_size=self.batch_size, + gpus=self.gpus, + accelerator=self.accelerator, + ) + # model_output["scores"] contains the sentence level scores + return model_output["system_score"] + + +comet = SampleLevelMetric( + metric_name="comet", + sample_level_fn=COMET(accelerator="cpu").compute, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=statistics.mean, + higher_is_better=True, +) + +class METEOR: + def __init__(self, alpha=0.9, beta=3, gamma=0.5): + self.alpha = alpha + self.beta = beta + self.gamma = gamma + + NLTK_VERSION = version.parse(importlib_metadata.version("nltk")) + assert NLTK_VERSION >= version.Version("3.9.0"), "NLTK version must be >= 3.9.0" + nltk.download("punkt_tab") + nltk.download("wordnet") + + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: + if isinstance(golds[0], list): # multiple references + scores = [ + meteor_score.meteor_score( + [word_tokenize(ref) for ref in refs], + word_tokenize(pred), + alpha=self.alpha, + beta=self.beta, + gamma=self.gamma, + ) + for refs, pred in zip(golds, predictions) + ] + else: + scores = [ + meteor_score.single_meteor_score( + word_tokenize(ref), + word_tokenize(pred), + alpha=self.alpha, + beta=self.beta, + gamma=self.gamma, + ) + for ref, pred in zip(golds, predictions) + ] + + return statistics.mean(scores) + + +meteor = SampleLevelMetric( + metric_name="meteor", + sample_level_fn=METEOR().compute, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=statistics.mean, + higher_is_better=True, +) # EVALS WITH SUBSET # This is how you create a subset task (like MMLU), which has several subset @@ -315,6 +386,7 @@ def create_prompt_fn(level_config: LevelConfig, src_lang: str, target_lang: str) target_text_col = f"{target_lang}_{text_col}" def prompt_fn(line: dict, task_name: str = None): + # TODO: replace this with the prompt template custom_query = f"{level_config.prompt_prefix}: {line[src_text_col]}\nTranslate from {src_lang} to {target_lang}.\nTranslation: " return Doc( @@ -325,6 +397,7 @@ def prompt_fn(line: dict, task_name: str = None): specific={ **{col: line[col] for col in level_config.metadata_cols}, "question": custom_query, + "source": line[src_text_col], }, ) @@ -359,11 +432,14 @@ def __init__( Metrics.bleu_4, Metrics.chrf, Metrics.ter, - bert_score(model_type="xlm-roberta-large"), - bleurt(model_size="tiny", seq_len=512), + bert_score, + bleurt, + comet, + 
meteor, freeform_gpt_judge(judge_model_name="gpt-4o"), - # freeform_flow_judge(), # TODO: Needs to be tested on GPU machine - # TODO: add prometheus eval + # Additionally we could consider adding the following open source judge models: + # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 + # However, these are only fine-tuned on English data and we need multilingual support. ], stop_sequence=["\n"], trust_dataset=True, From 5d41ce0a4f646e65419787155e545fc443c99fc0 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 20 Nov 2024 11:39:33 +0100 Subject: [PATCH 05/43] Ran pre-commit hooks. --- community_tasks/swiss_legal_evals.py | 47 +++++++++++----------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 54b01c4fb..5b47e1d64 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -23,43 +23,42 @@ # ruff: noqa: F405, F403, F401 """ This module contains task configurations and prompt functions for evaluating -LLM models on Swiss legal datasets. Each task is defined using the -`LightevalTaskConfig` class with its respective prompt function. The tasks -cover a variety of benchmarks, including: translation of laws, court decisions +LLM models on Swiss legal datasets. Each task is defined using the +`LightevalTaskConfig` class with its respective prompt function. The tasks +cover a variety of benchmarks, including: translation of laws, court decisions and press releases. Author: Joel Niklaus """ +import importlib.metadata as importlib_metadata import statistics from dataclasses import dataclass -from packaging import version -import importlib.metadata as importlib_metadata import nltk +from comet import download_model, load_from_checkpoint from nltk import word_tokenize from nltk.translate import meteor_score +from packaging import version +from transformers import AutoModelForSequenceClassification, AutoTokenizer -from comet import download_model, load_from_checkpoint - -from transformers import AutoTokenizer, AutoModelForSequenceClassification from lighteval.logging.hierarchical_logger import hlog_warn -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import Doc +from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.metrics import Metrics -from lighteval.metrics.metrics_sample import JudgeLLMMixEval, BertScore +from lighteval.metrics.metrics_sample import BertScore, JudgeLLMMixEval +from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip from lighteval.metrics.utils.metric_utils import ( MetricCategory, MetricUseCase, SampleLevelMetric, SampleLevelMetricGrouping, ) -from lighteval.metrics.imports.bert_scorer import BERTScorer -from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip -from lighteval.tasks.extended.mix_eval.main import process_judge_response_freeform_gpt from lighteval.tasks.extended.mix_eval.judge_prompts import ( gpt_judge_for_closeended_freeform, ) +from lighteval.tasks.extended.mix_eval.main import process_judge_response_freeform_gpt +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc # CUSTOM METRICS @@ -86,9 +85,7 @@ def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): def get_bert_score(model_type: str = "xlm-roberta-large"): - score = BertScore( - normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip - ) + score = 
BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip) score.bert_scorer = BERTScorer( # We could download the files from here and set the baseline_path ourselves: # https://github.com/Tiiiger/bert_score/tree/master/bert_score/rescale_baseline @@ -128,12 +125,8 @@ def __init__(self, model_size: str = "tiny", seq_len: int = 512): ], "Model size must be either tiny, base, or large" assert seq_len in [128, 512], "Sequence length must be either 128 or 512" - self.tokenizer = AutoTokenizer.from_pretrained( - f"Elron/bleurt-{model_size}-{seq_len}" - ) - self.model = AutoModelForSequenceClassification.from_pretrained( - f"Elron/bleurt-{model_size}-{seq_len}" - ) + self.tokenizer = AutoTokenizer.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") + self.model = AutoModelForSequenceClassification.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") self.max_length = seq_len self.model.eval() @@ -158,9 +151,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: max_length=self.max_length, ) if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): - hlog_warn( - f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring" - ) + hlog_warn(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") scores = self.model(**inputs)[0].squeeze() return scores.item() @@ -415,9 +406,7 @@ def __init__( super().__init__( name=f"{dataset_config.name}-{level_name}:{src_lang}-{target_lang}", suite=["community"], - prompt_function=create_prompt_fn( - dataset_config.subsets[level_name], src_lang, target_lang - ), + prompt_function=create_prompt_fn(dataset_config.subsets[level_name], src_lang, target_lang), hf_repo=dataset_config.hf_repo, hf_subset=level_name, hf_filter=None, From 81941254b99bc3a57f4a28f3784d4591fe945fda Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 20 Nov 2024 11:52:58 +0100 Subject: [PATCH 06/43] Changed prompt template. 
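Note: the prompt now follows Template A from the linked discussion: the source segment is labelled with its language code and the model continues after the target-language label, with generation stopping at the newline. A quick illustration of the rendered query for a DE-to-FR pair (the sentence is invented):

```python
src_lang, target_lang = "de", "fr"
source_text = "Der Vertrag ist nichtig."  # invented source segment

# Template A: "<SRC>: <text>\n<TGT>:" -- the completion after "FR:" is the translation.
custom_query = f"{src_lang.upper()}: {source_text}\n{target_lang.upper()}:"
print(custom_query)
# DE: Der Vertrag ist nichtig.
# FR:
```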
--- community_tasks/swiss_legal_evals.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 5b47e1d64..890b1b98d 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -283,7 +283,6 @@ def create_translation_pairs(langs_list: list) -> list[tuple]: class LevelConfig: name: str text_col_name: str - prompt_prefix: str metadata_cols: list[str] @@ -307,19 +306,16 @@ def __post_init__(self): "bge_level": LevelConfig( name="bge_level", text_col_name="bgeText", - prompt_prefix="Consider the following summary of a Swiss leading court decision", metadata_cols=["bge"], ), "regeste_level": LevelConfig( name="regeste_level", text_col_name="regesteText", - prompt_prefix="Consider the following paragraph of a summary of a Swiss leading court decision", metadata_cols=["bge"], ), "text_level": LevelConfig( name="text_level", text_col_name="text", - prompt_prefix="Consider the following sentence of a summary of a Swiss leading court decision", metadata_cols=["bge"], ), }, @@ -334,19 +330,16 @@ def __post_init__(self): "law_level": LevelConfig( name="law_level", text_col_name="lawText", - prompt_prefix="Consider the following Swiss federal law", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], ), "article_level": LevelConfig( name="article_level", text_col_name="articleText", - prompt_prefix="Consider the following Swiss federal law article", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], ), "paragraph_level": LevelConfig( name="paragraph_level", text_col_name="paragraphText", - prompt_prefix="Consider the following Swiss federal law paragraph", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], ), }, @@ -361,7 +354,6 @@ def __post_init__(self): "press_release": LevelConfig( name="press_release", text_col_name="text", - prompt_prefix="Consider the following Swiss Supreme Court press release", metadata_cols=["filename"], ) }, @@ -377,8 +369,8 @@ def create_prompt_fn(level_config: LevelConfig, src_lang: str, target_lang: str) target_text_col = f"{target_lang}_{text_col}" def prompt_fn(line: dict, task_name: str = None): - # TODO: replace this with the prompt template - custom_query = f"{level_config.prompt_prefix}: {line[src_text_col]}\nTranslate from {src_lang} to {target_lang}.\nTranslation: " + # Following Template A from https://github.com/huggingface/lighteval/pull/389#issuecomment-2471580177 + custom_query = f"{src_lang.upper()}: {line[src_text_col]}\n{target_lang.upper()}:" return Doc( task_name=task_name, From c58ae447be1efd449d5498eaf7b4138c34799412 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 21 Nov 2024 15:30:54 +0100 Subject: [PATCH 07/43] Added legal translation specific judge prompt. 
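Note: the prompt instructs the judge to end with a score in the strict [[0.x]] format because the response is parsed by process_judge_response_freeform_gpt (imported from the mix_eval task; the same regex appeared locally in patch 01). A hedged sketch of that extraction step, with an illustrative helper name and the decimal point escaped:

```python
import re

def extract_judge_score(judgment: str) -> float:
    # Pull the [[0.x]] correctness score out of a judge response; 0.0 if missing.
    # Mirrors the regex used by the freeform GPT judge parser in patch 01.
    match = re.search(r"\[\[(\d\.\d)\]\]", judgment)
    return float(match.group(1)) if match else 0.0

print(extract_judge_score("... The correctness score: [[0.7]]"))  # 0.7
```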
--- community_tasks/swiss_legal_evals.py | 95 ++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 5 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 890b1b98d..62113d1b2 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -53,16 +53,101 @@ SampleLevelMetric, SampleLevelMetricGrouping, ) -from lighteval.tasks.extended.mix_eval.judge_prompts import ( - gpt_judge_for_closeended_freeform, -) from lighteval.tasks.extended.mix_eval.main import process_judge_response_freeform_gpt from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc # CUSTOM METRICS -# TODO: adjust prompt + + +def swiss_legal_translation_judge(question, options, answer, gold): + return [ + { + "role": "system", + "content": "Act as a Judge specializing in the evaluation of translations of Swiss legal documents. Your task is to assess the accuracy, clarity, and fidelity of the model's translation to the golden translation, while considering the nuances of legal language.", + }, + { + "role": "user", + "content": f"""You will be provided with a source text, its golden translation, and the model's translation. Your task is to judge how correct the model's translation is based on the golden translation, and then give a correctness score. The correctness score should be one of the below numbers: 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). You should first briefly give your reasoning process regarding how the model's translation conforms to or contradicts the golden translation, and then give the correctness score. The correctness score must strictly follow this format: \"[[score]]\", e.g., \"The correctness score: [[0.5]]\". Below are some examples. + +Example 1: +Source Text: +Art. 20 +A contract is void if its terms are impossible, unlawful or immoral. However, where the defect pertains only to certain terms of a contract, those terms alone are void unless there is cause to assume that the contract would not have been concluded without them. + +Golden Translation: +Il contratto che ha per oggetto una cosa impossibile o contraria alle leggi od ai buoni costumi è nullo. Se il contratto è viziato solo in alcune parti, queste soltanto sono nulle, ove non si debba ammettere che senza la parte nulla esso non sarebbe stato conchiuso. + +Model’s Translation: +Il contratto è nullo se le sue clausole sono impossibili, illecite o immorali. Tuttavia, quando il vizio riguarda solo determinate clausole del contratto, solo queste sono nulle, salvo che vi sia motivo di ritenere che il contratto non sarebbe stato concluso senza di esse. + +Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy, clarity, and fidelity to the source text. However, there are minor stylistic differences. For example, the golden translation uses “conchiuso,” an older and more formal term, while the model opts for “concluso,” which is modern. Similarly, the golden translation uses the idiomatic phrase “contraria alle leggi od ai buoni costumi,” whereas the model employs the more literal “illecite o immorali”. The correctness score: [[0.9]] + +Example 2: +Source Text: +Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von polizeilichen Aufzeichnungen der automatischen Fahrzeugfahndung und Verkehrsüberwachung (AFV). 
+Die Erhebung und die Aufbewahrung von Aufzeichnungen der AFV stellen einen Eingriff in die Grundrechte der Betroffenen dar, insbesondere in das Recht auf Privatsphäre, das den Anspruch auf informationelle Selbstbestimmung miteinschliesst (E. 3.1). Für die AFV besteht im Kanton Thurgau keine hinreichend bestimmte gesetzliche Grundlage. Der mit der Überwachung verbundene Eingriff in die Privatsphäre verstösst daher gegen Art. 13 Abs. 2 i.V.m. Art. 36 Abs. 1 BV (E. 3.2 und 3.3). +Stellt die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit strafbare Handlungen fest, ermittelt sie nach Art. 306 ff. StPO. Die Frage, ob die mangels gesetzlicher Grundlage rechtswidrig erhobenen Beweismittel im Strafprozess verwertbar sind, ist nach Art. 141 Abs. 2 StPO zu prüfen (Präzisierung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit im vorliegenden Fall verneint (E. 4.3). + +Golden Translation: +Art. 13 al. 2, art. 36 al. 1 Cst.; art. 141 al. 2 CPP; exploitabilité d'enregistrements réalisés par la police au moyen du Système de recherche automatisée de véhicules et surveillance du trafic (RVS). La réalisation et le stockage d'enregistrements par la RVS constituent une atteinte aux droits fondamentaux des personnes concernées, singulièrement à la protection de leur sphère privée, qui englobe le droit à l'autodétermination en matière de données (consid. 3.1). Le recours à la RVS ne repose sur aucune base légale suffisamment précise dans le canton de Thurgovie. L'atteinte à la sphère privée inhérente à cette surveillance viole partant l'art. 13 al. 2 en corrélation avec l'art. 36 al. 1 Cst. (consid. 3.2 et 3.3). La constatation d'infractions par la police dans le cadre de son activité de contrôle préventive, relève de l'investigation au sens des art. 306 ss CPP. L'exploitabilité dans le procès pénal de preuves recueillies de manière illicite faute de base légale doit être examinée à l'aune de l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Exploitabilité niée en l'espèce (consid. 4.3). + +Model’s Translation: +Art. 13 al. 2, Art. 36 al. 1 Cst.; Art. 141 al. 2 CPP; Admissibilité des enregistrements de la poursuite automatisée des véhicules et de la surveillance de la circulation (AFV). +La collecte et la conservation des enregistrements de l'AFV constituent une ingérence dans les droits fondamentaux des personnes concernées, en particulier dans le droit à la vie privée, qui inclut le droit à l'autodétermination informationnelle (consid. 3.1). Dans le canton de Thurgovie, il n'existe pas de base légale suffisamment déterminée pour l'AFV. Par conséquent, l'ingérence dans la vie privée associée à la surveillance viole donc les art. 13 al. 2 combiné avec l'art. 36 al. 1 Cst (consid. 3.2 et 3.3). +Si la police relève des infractions pénales dans le cadre de ses activités de contrôle préventif, elle enquête conformément aux art. 306 et suivants CPP. La question de savoir si les preuves illégalement recueillies peuvent être utilisées dans la procédure pénale est examinée conformément à l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Admissibilité dans le cas présent niée (consid. 4.3). + +Your Judgment: The model’s translation mostly aligns with the golden translation but diverges when it comes to accuracy and fidelity to Swiss legal terminology. For instance, the term “exploitabilité” which is closer to the Swiss provision is replaced in the model’s translation with “admissibilité”. 
Similarly, “ingérence” is used instead of “atteinte”, although “atteinte” is commonly used in Swiss law to discuss a violation of fundamental rights. Also, the term "recherche automatisée de véhicules et surveillance du trafic (RVS)" used by the golden translation is more established than "poursuite automatisée des véhicules et de la surveillance de la circulation (AFV)" in the model’s translation. The model’s translation is almost complete, but omits a critical point in one sentence: that the evidence was unlawfully obtained due to lack of a sufficiently clear legal basis. This omission impacts the completeness. The correctness score: [[0.7]] + +Example 3: +Source Text: +Yoko Ono est propriétaire de la montre de John Lennon – rejet du recours d'un collectionneur contre un arrêt rendu par la Cour de justice genevoise + +Le Tribunal fédéral rejette le recours déposé par un collectionneur contre l'arrêt de la Cour de justice genevoise par lequel celle-ci confirmait que Yoko Ono est propriétaire de la montre qu'elle avait offerte à John Lennon en 1980, deux mois avant qu'il ne soit assassiné. Le collectionneur, qui a remis la montre à une maison de vente aux enchères genevoise en 2014 afin d'en faire estimer la valeur, a quant à lui revendiqué la propriété de ladite montre. + +En 1980, Yoko Ono a acquis à New York une montre de marque Patek Philippe. Elle y a fait graver au dos l'inscription « (JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C » et l'a offerte à son époux, John Lennon, le 9 octobre 1980 pour son 40e anniversaire. Le 8 décembre 1980, John Lennon a été assassiné à New York. La montre a été répertoriée dans l'inventaire successoral et conservée dans une pièce de l'appartement de Yoko Ono à New York. Par la suite, la montre s'est retrouvée aux mains d'un homme qui avait été le chauffeur privé de Yoko Ono de 1995 à 2006. Un autre possesseur intermédiaire l'a remise à une maison de vente aux enchères allemande, où elle a été acquise par un collectionneur en 2014. Ce dernier l'a remise la même année à une maison de vente aux enchères genevoise afin d'en faire estimer la valeur, ce dont a été informée Yoko Ono. Cette dernière n'avait jusqu'alors pas eu conscience du fait que la montre n'était plus en sa possession. En 2018, le collectionneur a formé à Genève une action visant à constater sa qualité de propriétaire, action à laquelle Yoko Ono s'est opposée. En 2022, le tribunal de première instance genevois a constaté que Yoko Ono était la seule et unique propriétaire de la montre, ce que la Cour de justice du canton de Genève, statuant sur appel du collectionneur, a confirmé en 2023. + +Le Tribunal fédéral rejette le recours déposé par le collectionneur contre cet arrêt. Il n'est tout d'abord pas contesté que la propriété de la montre a été acquise par succession par Yoko Ono après le décès de John Lennon. C'est en outre sans arbitraire que la Cour de justice genevoise a retenu que la montre avait été volée par l'ancien chauffeur et que, à l'inverse, aucun élément ne permettait de démontrer que Yoko Ono aurait eu l'intention de faire donation au chauffeur d'une chose si particulière que la montre, gravée d'une inscription, qu'elle avait offerte à John Lennon deux mois avant son décès. 
Dès lors qu'il s'agit d'une chose volée, le collectionneur, aujourd'hui recourant, ne pouvait pas acquérir la propriété de la montre par un mode originaire d'acquisition lorsqu'il l'a achetée en Allemagne en 2014 ; selon le droit allemand applicable en la matière, cela vaut indépendamment du fait que l'acquéreur était ou non de bonne foi quant à l'origine de la chose. + +Golden Translation: +Yoko Ono ist Eigentümerin der Uhr von John Lennon – +Beschwerde von Sammler gegen Genfer Urteil abgewiesen + +Das Bundesgericht weist die Beschwerde eines Sammlers gegen das Urteil des Genfer Kantonsgerichts ab, mit dem Yoko Ono als Eigentümerin der Uhr bestätigt wurde, die sie John Lennon 1980 zwei Monate vor seiner Ermordung geschenkt hat. Der Sammler hatte die Uhr 2014 zur Schätzung bei einem Auktionshaus in Genf eingereicht und seinerseits Eigentümerschaft an der Uhr geltend gemacht. +Yoko Ono hatte 1980 in New York eine Uhr der Marke Patek Philippe gekauft. Sie liess auf der Rückseite die Gravur "(JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C" anbringen und schenkte sie ihrem Ehemann John Lennon am 9. Oktober 1980 zum 40. Geburtstag. Am 8. Dezember 1980 wurde John Lennon in New York ermordet. Die Uhr wurde ins Erbschaftsinventar aufgenommen und in einem Zimmer der Wohnung von Yoko Ono in New York aufbewahrt. Sie gelangte von dort in die Hände eines Mannes, der von 1995 bis 2006 Privatchauffeur von Yoko Ono gewesen war. Ein weiterer Zwischenbesitzer brachte die Uhr in ein deutsches Auktionshaus, wo sie 2014 von einem Sammler erworben wurde. Dieser reichte die Uhr im gleichen Jahr bei einem Auktionshaus in Genf zur Schätzung ihres Wertes ein. Davon erfuhr Yoko Ono, die bis dahin keine Kenntnis davon gehabt hatte, dass sich die Uhr nicht mehr in ihrem Besitz befand. Der Sammler erhob 2018 in Genf eine Klage auf Feststellung seiner Eigentümerschaft, der sich Yoko Ono widersetzte. Das erstinstanzliche Genfer Gericht stellte 2022 fest, dass Yoko Ono die alleinige Eigentümerin der Uhr sei, was auf Berufung des Sammlers 2023 vom Kantonsgericht des Kantons Genf bestätigt wurde. + +Das Bundesgericht weist die dagegen erhobene Beschwerde des Sammlers ab. Unbestritten ist zunächst, dass das Eigentum an der Uhr nach dem Tod von John Lennon durch Erbschaft an Yoko Ono übergegangen ist. Sodann durfte das Genfer Kantonsgericht willkürfrei davon ausgehen, dass die Uhr vom ehemaligen Chauffeur gestohlen wurde und umgekehrt nichts dafür spricht, dass Yoko Ono dem Chauffeur dieses einzigartige Stück, welches sie John Lennon zwei Monate vor seinem Tod mit einer Inschrift versehen überreicht hatte, hätte schenken wollen. Da es sich bei der Uhr um eine gestohlene Sache handelt, konnte der Sammler und heutige Beschwerdeführer bei ihrem Erwerb 2014 in Deutschland nicht unmittelbar zum Eigentümer werden; dies gilt gemäss dem diesbezüglich anwendbaren deutschen Recht unabhängig davon, ob der Erwerber in Bezug auf die Herkunft der Sache in gutem Glauben war. + + +Model’s Translation: +Hier, der Bundesgerichtshof hat den Antrag des Käufer gegen den Urteil des Zivilgerichts in Zürich abgelehnt, das das Zivilgericht in Zürich 2023 bestätigt hatte, dass Yoko Ono die Uhr als Geschenk 1980 an John Lennon gegeben hat, bevor dieser am 8. Dezember 1980 ermordet wurde. Der Käufer, der die Uhr 2014 in eine Geneßische Auktion verkauft hat, behauptet, die Uhr sei ihm verliehen worden, und nicht von Yoko Ono. + +Im Jahr 1980 hat Yoko Ono in New York eine Uhr von der Marke Patek Philippe gekauft. 
Sie hat auf der Rückseite der Uhr die Inschrift "JUST LIKE STARTING OVER LOVE YOKO 10·9·1980 N.Y.C." gemalt und sie an ihren Ehemann, John Lennon, am 9. Oktober 1980 als Geburtstag gegeben. Am 8. Dezember 1980 wurde Lennon ermordet in New York. Die Uhr wurde in den Erbeverzeichnis des Lennon-Mädchens aufbewahrt und in einem Zimmer in Yoko Onos New Yorker Wohnung. + +Danach wurde die Uhr von einem Mann, der als Wagenkellner von Yoko Ono zwischen 1995 und 2006 gearbeitet hat, in die Hände gebracht. Ein weiterer mittlerer Eigentümer hat sie 2014 in eine deutsche Auktion verkauft, wo sie einem Käufer in Zürich übergeben wurde. Dieser Käufer hat die Uhr 2014 in eine Geneßische Auktion verkauft, um sie zu valieren, und Yoko Ono war von diesem Moment an nicht mehr bewusst, dass die Uhr nicht mehr in ihrer Hand war. + +Im Jahr 2018 hat der Käufer in Zürich eine Anzeige gegen Yoko Ono eröffnet, um die Qualität des Eigentums der Uhr zu bestätigen, was Yoko Ono ansprach. Im Jahr 2022 hat der Zivilgericht in Zürich bestätigt, dass Yoko Ono die einzige und letzte Eigentümerin der Uhr ist, was der Zivilgerichtshof im Jahr 2023 bestätigt hat. + +Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat. + +Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] + +Judge the below case, give the brief reasoning process and the correctness score. + + +Source: +{question} + +Golden Translation: +{gold} + +Model's Translation: +{answer} + +Your Judgment:""", + }, + ] def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): @@ -73,7 +158,7 @@ def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): use_case=MetricUseCase.SUMMARIZATION, sample_level_fn=JudgeLLMMixEval( judge_model_name=judge_model_name, - template=gpt_judge_for_closeended_freeform, + template=swiss_legal_translation_judge, process_judge_response=process_judge_response_freeform_gpt, judge_backend="openai", short_judge_name=judge_model_name, From ff3705f94ae1085035c79b1baeec50ae00c816de Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 21 Nov 2024 15:41:59 +0100 Subject: [PATCH 08/43] Improved judge prompt. --- community_tasks/swiss_legal_evals.py | 43 ++++++++++++++-------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 62113d1b2..85245aab3 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -73,55 +73,56 @@ def swiss_legal_translation_judge(question, options, answer, gold): Example 1: Source Text: -Art. 
20 -A contract is void if its terms are impossible, unlawful or immoral. However, where the defect pertains only to certain terms of a contract, those terms alone are void unless there is cause to assume that the contract would not have been concluded without them. +```A contract is void if its terms are impossible, unlawful or immoral. However, where the defect pertains only to certain terms of a contract, those terms alone are void unless there is cause to assume that the contract would not have been concluded without them.``` Golden Translation: -Il contratto che ha per oggetto una cosa impossibile o contraria alle leggi od ai buoni costumi è nullo. Se il contratto è viziato solo in alcune parti, queste soltanto sono nulle, ove non si debba ammettere che senza la parte nulla esso non sarebbe stato conchiuso. +```Il contratto che ha per oggetto una cosa impossibile o contraria alle leggi od ai buoni costumi è nullo. Se il contratto è viziato solo in alcune parti, queste soltanto sono nulle, ove non si debba ammettere che senza la parte nulla esso non sarebbe stato conchiuso.``` Model’s Translation: -Il contratto è nullo se le sue clausole sono impossibili, illecite o immorali. Tuttavia, quando il vizio riguarda solo determinate clausole del contratto, solo queste sono nulle, salvo che vi sia motivo di ritenere che il contratto non sarebbe stato concluso senza di esse. +```Il contratto è nullo se le sue clausole sono impossibili, illecite o immorali. Tuttavia, quando il vizio riguarda solo determinate clausole del contratto, solo queste sono nulle, salvo che vi sia motivo di ritenere che il contratto non sarebbe stato concluso senza di esse.``` Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy, clarity, and fidelity to the source text. However, there are minor stylistic differences. For example, the golden translation uses “conchiuso,” an older and more formal term, while the model opts for “concluso,” which is modern. Similarly, the golden translation uses the idiomatic phrase “contraria alle leggi od ai buoni costumi,” whereas the model employs the more literal “illecite o immorali”. The correctness score: [[0.9]] Example 2: Source Text: -Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von polizeilichen Aufzeichnungen der automatischen Fahrzeugfahndung und Verkehrsüberwachung (AFV). +```Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von polizeilichen Aufzeichnungen der automatischen Fahrzeugfahndung und Verkehrsüberwachung (AFV). Die Erhebung und die Aufbewahrung von Aufzeichnungen der AFV stellen einen Eingriff in die Grundrechte der Betroffenen dar, insbesondere in das Recht auf Privatsphäre, das den Anspruch auf informationelle Selbstbestimmung miteinschliesst (E. 3.1). Für die AFV besteht im Kanton Thurgau keine hinreichend bestimmte gesetzliche Grundlage. Der mit der Überwachung verbundene Eingriff in die Privatsphäre verstösst daher gegen Art. 13 Abs. 2 i.V.m. Art. 36 Abs. 1 BV (E. 3.2 und 3.3). -Stellt die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit strafbare Handlungen fest, ermittelt sie nach Art. 306 ff. StPO. Die Frage, ob die mangels gesetzlicher Grundlage rechtswidrig erhobenen Beweismittel im Strafprozess verwertbar sind, ist nach Art. 141 Abs. 2 StPO zu prüfen (Präzisierung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit im vorliegenden Fall verneint (E. 4.3). 
+Stellt die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit strafbare Handlungen fest, ermittelt sie nach Art. 306 ff. StPO. Die Frage, ob die mangels gesetzlicher Grundlage rechtswidrig erhobenen Beweismittel im Strafprozess verwertbar sind, ist nach Art. 141 Abs. 2 StPO zu prüfen (Präzisierung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit im vorliegenden Fall verneint (E. 4.3).``` Golden Translation: -Art. 13 al. 2, art. 36 al. 1 Cst.; art. 141 al. 2 CPP; exploitabilité d'enregistrements réalisés par la police au moyen du Système de recherche automatisée de véhicules et surveillance du trafic (RVS). La réalisation et le stockage d'enregistrements par la RVS constituent une atteinte aux droits fondamentaux des personnes concernées, singulièrement à la protection de leur sphère privée, qui englobe le droit à l'autodétermination en matière de données (consid. 3.1). Le recours à la RVS ne repose sur aucune base légale suffisamment précise dans le canton de Thurgovie. L'atteinte à la sphère privée inhérente à cette surveillance viole partant l'art. 13 al. 2 en corrélation avec l'art. 36 al. 1 Cst. (consid. 3.2 et 3.3). La constatation d'infractions par la police dans le cadre de son activité de contrôle préventive, relève de l'investigation au sens des art. 306 ss CPP. L'exploitabilité dans le procès pénal de preuves recueillies de manière illicite faute de base légale doit être examinée à l'aune de l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Exploitabilité niée en l'espèce (consid. 4.3). +```Art. 13 al. 2, art. 36 al. 1 Cst.; art. 141 al. 2 CPP; exploitabilité d'enregistrements réalisés par la police au moyen du Système de recherche automatisée de véhicules et surveillance du trafic (RVS). +La réalisation et le stockage d'enregistrements par la RVS constituent une atteinte aux droits fondamentaux des personnes concernées, singulièrement à la protection de leur sphère privée, qui englobe le droit à l'autodétermination en matière de données (consid. 3.1). Le recours à la RVS ne repose sur aucune base légale suffisamment précise dans le canton de Thurgovie. L'atteinte à la sphère privée inhérente à cette surveillance viole partant l'art. 13 al. 2 en corrélation avec l'art. 36 al. 1 Cst. (consid. 3.2 et 3.3). +La constatation d'infractions par la police dans le cadre de son activité de contrôle préventive, relève de l'investigation au sens des art. 306 ss CPP. L'exploitabilité dans le procès pénal de preuves recueillies de manière illicite faute de base légale doit être examinée à l'aune de l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Exploitabilité niée en l'espèce (consid. 4.3).``` Model’s Translation: -Art. 13 al. 2, Art. 36 al. 1 Cst.; Art. 141 al. 2 CPP; Admissibilité des enregistrements de la poursuite automatisée des véhicules et de la surveillance de la circulation (AFV). +```Art. 13 al. 2, Art. 36 al. 1 Cst.; Art. 141 al. 2 CPP; Admissibilité des enregistrements de la poursuite automatisée des véhicules et de la surveillance de la circulation (AFV). La collecte et la conservation des enregistrements de l'AFV constituent une ingérence dans les droits fondamentaux des personnes concernées, en particulier dans le droit à la vie privée, qui inclut le droit à l'autodétermination informationnelle (consid. 3.1). Dans le canton de Thurgovie, il n'existe pas de base légale suffisamment déterminée pour l'AFV. Par conséquent, l'ingérence dans la vie privée associée à la surveillance viole donc les art. 13 al. 
2 combiné avec l'art. 36 al. 1 Cst (consid. 3.2 et 3.3). -Si la police relève des infractions pénales dans le cadre de ses activités de contrôle préventif, elle enquête conformément aux art. 306 et suivants CPP. La question de savoir si les preuves illégalement recueillies peuvent être utilisées dans la procédure pénale est examinée conformément à l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Admissibilité dans le cas présent niée (consid. 4.3). +Si la police relève des infractions pénales dans le cadre de ses activités de contrôle préventif, elle enquête conformément aux art. 306 et suivants CPP. La question de savoir si les preuves illégalement recueillies peuvent être utilisées dans la procédure pénale est examinée conformément à l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Admissibilité dans le cas présent niée (consid. 4.3).``` Your Judgment: The model’s translation mostly aligns with the golden translation but diverges when it comes to accuracy and fidelity to Swiss legal terminology. For instance, the term “exploitabilité” which is closer to the Swiss provision is replaced in the model’s translation with “admissibilité”. Similarly, “ingérence” is used instead of “atteinte”, although “atteinte” is commonly used in Swiss law to discuss a violation of fundamental rights. Also, the term "recherche automatisée de véhicules et surveillance du trafic (RVS)" used by the golden translation is more established than "poursuite automatisée des véhicules et de la surveillance de la circulation (AFV)" in the model’s translation. The model’s translation is almost complete, but omits a critical point in one sentence: that the evidence was unlawfully obtained due to lack of a sufficiently clear legal basis. This omission impacts the completeness. The correctness score: [[0.7]] Example 3: Source Text: -Yoko Ono est propriétaire de la montre de John Lennon – rejet du recours d'un collectionneur contre un arrêt rendu par la Cour de justice genevoise +```Yoko Ono est propriétaire de la montre de John Lennon – rejet du recours d'un collectionneur contre un arrêt rendu par la Cour de justice genevoise Le Tribunal fédéral rejette le recours déposé par un collectionneur contre l'arrêt de la Cour de justice genevoise par lequel celle-ci confirmait que Yoko Ono est propriétaire de la montre qu'elle avait offerte à John Lennon en 1980, deux mois avant qu'il ne soit assassiné. Le collectionneur, qui a remis la montre à une maison de vente aux enchères genevoise en 2014 afin d'en faire estimer la valeur, a quant à lui revendiqué la propriété de ladite montre. En 1980, Yoko Ono a acquis à New York une montre de marque Patek Philippe. Elle y a fait graver au dos l'inscription « (JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C » et l'a offerte à son époux, John Lennon, le 9 octobre 1980 pour son 40e anniversaire. Le 8 décembre 1980, John Lennon a été assassiné à New York. La montre a été répertoriée dans l'inventaire successoral et conservée dans une pièce de l'appartement de Yoko Ono à New York. Par la suite, la montre s'est retrouvée aux mains d'un homme qui avait été le chauffeur privé de Yoko Ono de 1995 à 2006. Un autre possesseur intermédiaire l'a remise à une maison de vente aux enchères allemande, où elle a été acquise par un collectionneur en 2014. Ce dernier l'a remise la même année à une maison de vente aux enchères genevoise afin d'en faire estimer la valeur, ce dont a été informée Yoko Ono. 
Cette dernière n'avait jusqu'alors pas eu conscience du fait que la montre n'était plus en sa possession. En 2018, le collectionneur a formé à Genève une action visant à constater sa qualité de propriétaire, action à laquelle Yoko Ono s'est opposée. En 2022, le tribunal de première instance genevois a constaté que Yoko Ono était la seule et unique propriétaire de la montre, ce que la Cour de justice du canton de Genève, statuant sur appel du collectionneur, a confirmé en 2023. -Le Tribunal fédéral rejette le recours déposé par le collectionneur contre cet arrêt. Il n'est tout d'abord pas contesté que la propriété de la montre a été acquise par succession par Yoko Ono après le décès de John Lennon. C'est en outre sans arbitraire que la Cour de justice genevoise a retenu que la montre avait été volée par l'ancien chauffeur et que, à l'inverse, aucun élément ne permettait de démontrer que Yoko Ono aurait eu l'intention de faire donation au chauffeur d'une chose si particulière que la montre, gravée d'une inscription, qu'elle avait offerte à John Lennon deux mois avant son décès. Dès lors qu'il s'agit d'une chose volée, le collectionneur, aujourd'hui recourant, ne pouvait pas acquérir la propriété de la montre par un mode originaire d'acquisition lorsqu'il l'a achetée en Allemagne en 2014 ; selon le droit allemand applicable en la matière, cela vaut indépendamment du fait que l'acquéreur était ou non de bonne foi quant à l'origine de la chose. +Le Tribunal fédéral rejette le recours déposé par le collectionneur contre cet arrêt. Il n'est tout d'abord pas contesté que la propriété de la montre a été acquise par succession par Yoko Ono après le décès de John Lennon. C'est en outre sans arbitraire que la Cour de justice genevoise a retenu que la montre avait été volée par l'ancien chauffeur et que, à l'inverse, aucun élément ne permettait de démontrer que Yoko Ono aurait eu l'intention de faire donation au chauffeur d'une chose si particulière que la montre, gravée d'une inscription, qu'elle avait offerte à John Lennon deux mois avant son décès. Dès lors qu'il s'agit d'une chose volée, le collectionneur, aujourd'hui recourant, ne pouvait pas acquérir la propriété de la montre par un mode originaire d'acquisition lorsqu'il l'a achetée en Allemagne en 2014 ; selon le droit allemand applicable en la matière, cela vaut indépendamment du fait que l'acquéreur était ou non de bonne foi quant à l'origine de la chose.``` Golden Translation: -Yoko Ono ist Eigentümerin der Uhr von John Lennon – -Beschwerde von Sammler gegen Genfer Urteil abgewiesen +```Yoko Ono ist Eigentümerin der Uhr von John Lennon – Beschwerde von Sammler gegen Genfer Urteil abgewiesen Das Bundesgericht weist die Beschwerde eines Sammlers gegen das Urteil des Genfer Kantonsgerichts ab, mit dem Yoko Ono als Eigentümerin der Uhr bestätigt wurde, die sie John Lennon 1980 zwei Monate vor seiner Ermordung geschenkt hat. Der Sammler hatte die Uhr 2014 zur Schätzung bei einem Auktionshaus in Genf eingereicht und seinerseits Eigentümerschaft an der Uhr geltend gemacht. + Yoko Ono hatte 1980 in New York eine Uhr der Marke Patek Philippe gekauft. Sie liess auf der Rückseite die Gravur "(JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C" anbringen und schenkte sie ihrem Ehemann John Lennon am 9. Oktober 1980 zum 40. Geburtstag. Am 8. Dezember 1980 wurde John Lennon in New York ermordet. Die Uhr wurde ins Erbschaftsinventar aufgenommen und in einem Zimmer der Wohnung von Yoko Ono in New York aufbewahrt. 
Sie gelangte von dort in die Hände eines Mannes, der von 1995 bis 2006 Privatchauffeur von Yoko Ono gewesen war. Ein weiterer Zwischenbesitzer brachte die Uhr in ein deutsches Auktionshaus, wo sie 2014 von einem Sammler erworben wurde. Dieser reichte die Uhr im gleichen Jahr bei einem Auktionshaus in Genf zur Schätzung ihres Wertes ein. Davon erfuhr Yoko Ono, die bis dahin keine Kenntnis davon gehabt hatte, dass sich die Uhr nicht mehr in ihrem Besitz befand. Der Sammler erhob 2018 in Genf eine Klage auf Feststellung seiner Eigentümerschaft, der sich Yoko Ono widersetzte. Das erstinstanzliche Genfer Gericht stellte 2022 fest, dass Yoko Ono die alleinige Eigentümerin der Uhr sei, was auf Berufung des Sammlers 2023 vom Kantonsgericht des Kantons Genf bestätigt wurde. -Das Bundesgericht weist die dagegen erhobene Beschwerde des Sammlers ab. Unbestritten ist zunächst, dass das Eigentum an der Uhr nach dem Tod von John Lennon durch Erbschaft an Yoko Ono übergegangen ist. Sodann durfte das Genfer Kantonsgericht willkürfrei davon ausgehen, dass die Uhr vom ehemaligen Chauffeur gestohlen wurde und umgekehrt nichts dafür spricht, dass Yoko Ono dem Chauffeur dieses einzigartige Stück, welches sie John Lennon zwei Monate vor seinem Tod mit einer Inschrift versehen überreicht hatte, hätte schenken wollen. Da es sich bei der Uhr um eine gestohlene Sache handelt, konnte der Sammler und heutige Beschwerdeführer bei ihrem Erwerb 2014 in Deutschland nicht unmittelbar zum Eigentümer werden; dies gilt gemäss dem diesbezüglich anwendbaren deutschen Recht unabhängig davon, ob der Erwerber in Bezug auf die Herkunft der Sache in gutem Glauben war. +Das Bundesgericht weist die dagegen erhobene Beschwerde des Sammlers ab. Unbestritten ist zunächst, dass das Eigentum an der Uhr nach dem Tod von John Lennon durch Erbschaft an Yoko Ono übergegangen ist. Sodann durfte das Genfer Kantonsgericht willkürfrei davon ausgehen, dass die Uhr vom ehemaligen Chauffeur gestohlen wurde und umgekehrt nichts dafür spricht, dass Yoko Ono dem Chauffeur dieses einzigartige Stück, welches sie John Lennon zwei Monate vor seinem Tod mit einer Inschrift versehen überreicht hatte, hätte schenken wollen. Da es sich bei der Uhr um eine gestohlene Sache handelt, konnte der Sammler und heutige Beschwerdeführer bei ihrem Erwerb 2014 in Deutschland nicht unmittelbar zum Eigentümer werden; dies gilt gemäss dem diesbezüglich anwendbaren deutschen Recht unabhängig davon, ob der Erwerber in Bezug auf die Herkunft der Sache in gutem Glauben war.``` Model’s Translation: -Hier, der Bundesgerichtshof hat den Antrag des Käufer gegen den Urteil des Zivilgerichts in Zürich abgelehnt, das das Zivilgericht in Zürich 2023 bestätigt hatte, dass Yoko Ono die Uhr als Geschenk 1980 an John Lennon gegeben hat, bevor dieser am 8. Dezember 1980 ermordet wurde. Der Käufer, der die Uhr 2014 in eine Geneßische Auktion verkauft hat, behauptet, die Uhr sei ihm verliehen worden, und nicht von Yoko Ono. +```Hier, der Bundesgerichtshof hat den Antrag des Käufer gegen den Urteil des Zivilgerichts in Zürich abgelehnt, das das Zivilgericht in Zürich 2023 bestätigt hatte, dass Yoko Ono die Uhr als Geschenk 1980 an John Lennon gegeben hat, bevor dieser am 8. Dezember 1980 ermordet wurde. Der Käufer, der die Uhr 2014 in eine Geneßische Auktion verkauft hat, behauptet, die Uhr sei ihm verliehen worden, und nicht von Yoko Ono. Im Jahr 1980 hat Yoko Ono in New York eine Uhr von der Marke Patek Philippe gekauft. 
Sie hat auf der Rückseite der Uhr die Inschrift "JUST LIKE STARTING OVER LOVE YOKO 10·9·1980 N.Y.C." gemalt und sie an ihren Ehemann, John Lennon, am 9. Oktober 1980 als Geburtstag gegeben. Am 8. Dezember 1980 wurde Lennon ermordet in New York. Die Uhr wurde in den Erbeverzeichnis des Lennon-Mädchens aufbewahrt und in einem Zimmer in Yoko Onos New Yorker Wohnung. @@ -129,21 +130,21 @@ def swiss_legal_translation_judge(question, options, answer, gold): Im Jahr 2018 hat der Käufer in Zürich eine Anzeige gegen Yoko Ono eröffnet, um die Qualität des Eigentums der Uhr zu bestätigen, was Yoko Ono ansprach. Im Jahr 2022 hat der Zivilgericht in Zürich bestätigt, dass Yoko Ono die einzige und letzte Eigentümerin der Uhr ist, was der Zivilgerichtshof im Jahr 2023 bestätigt hat. -Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat. +Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat.``` -Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] +Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has several grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] Judge the below case, give the brief reasoning process and the correctness score. -Source: -{question} +Source Text: +```{question}``` Golden Translation: -{gold} +```{gold}``` Model's Translation: -{answer} +```{answer}``` Your Judgment:""", }, From 091ec1134f523e7f253441ae48455aaff9ccd443 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 21 Nov 2024 16:56:22 +0100 Subject: [PATCH 09/43] Changed metric selection. 
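The judge examples above all end with a bracketed correctness score such as [[0.7]] or [[0.2]]. As an illustration only (this helper is not part of the patch, and its name and 0.0 fallback are assumptions), extracting that score from a judgment string might look like this:

```
import re


def extract_bracketed_score(judgment: str, default: float = 0.0) -> float:
    """Return the last [[x.y]] score found in a judgment, or `default` if none is present."""
    matches = re.findall(r"\[\[(\d(?:\.\d+)?)\]\]", judgment)
    if not matches:
        return default
    score = float(matches[-1])
    # Clamp to the 0-1 range the prompt asks for, in case the judge drifts outside it.
    return min(max(score, 0.0), 1.0)


if __name__ == "__main__":
    example = "The translation is mostly faithful. The correctness score: [[0.7]]"
    print(extract_bracketed_score(example))  # 0.7
```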
--- community_tasks/swiss_legal_evals.py | 83 ++++++++++++++++++---------- 1 file changed, 55 insertions(+), 28 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 85245aab3..8958d3192 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -151,12 +151,13 @@ def swiss_legal_translation_judge(question, options, answer, gold): ] -def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): +def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): + name = f"swiss_legal_translation_judge_{judge_model_name}" return SampleLevelMetricGrouping( - metric_name=[f"llm_judge_{judge_model_name}"], - higher_is_better={"judge_score_{judge_model_name}": True}, + metric_name=[name], + higher_is_better={name: True}, category=MetricCategory.LLM_AS_JUDGE, - use_case=MetricUseCase.SUMMARIZATION, + use_case=MetricUseCase.TRANSLATION, sample_level_fn=JudgeLLMMixEval( judge_model_name=judge_model_name, template=swiss_legal_translation_judge, @@ -165,11 +166,14 @@ def freeform_gpt_judge(judge_model_name: str = "gpt-4o"): short_judge_name=judge_model_name, ).compute, corpus_level_fn={ - f"judge_score_{judge_model_name}": statistics.mean, + name: statistics.mean, }, ) +swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") + + def get_bert_score(model_type: str = "xlm-roberta-large"): score = BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip) score.bert_scorer = BERTScorer( @@ -184,7 +188,7 @@ def get_bert_score(model_type: str = "xlm-roberta-large"): metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], sample_level_fn=score.compute, category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.SUMMARIZATION, + use_case=MetricUseCase.TRANSLATION, corpus_level_fn={ "BERTScore-P": statistics.mean, "BERTScore-R": statistics.mean, @@ -242,14 +246,20 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return scores.item() -bleurt = SampleLevelMetric( - metric_name="bleurt", - sample_level_fn=BLEURT(model_size="tiny", seq_len=512).compute, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - corpus_level_fn=statistics.mean, - higher_is_better=True, -) +def get_bleurt(model_size: str = "tiny", seq_len: int = 512): + return SampleLevelMetric( + metric_name=f"bleurt_{model_size}", + sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len).compute, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=statistics.mean, + higher_is_better=True, + ) + + +# bleurt_tiny = get_bleurt(model_size="tiny", seq_len=512) +# bleurt_base = get_bleurt(model_size="base", seq_len=512) +bleurt_large = get_bleurt(model_size="large", seq_len=512) class COMET: @@ -285,14 +295,31 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return model_output["system_score"] -comet = SampleLevelMetric( - metric_name="comet", - sample_level_fn=COMET(accelerator="cpu").compute, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - corpus_level_fn=statistics.mean, - higher_is_better=True, -) +def get_comet( + model_name: str = "Unbabel/wmt22-comet-da", + batch_size: int = 1, + gpus: int = 1, + accelerator: str = "cpu", +): + return SampleLevelMetric( + metric_name=model_name.split("/")[-1], + sample_level_fn=COMET( + model_name=model_name, + batch_size=batch_size, + gpus=gpus, + accelerator=accelerator, + ).compute, + 
category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + corpus_level_fn=statistics.mean, + higher_is_better=True, + ) + + +# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. +comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=1, gpus=1, accelerator="cpu") +# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=1, gpus=1, accelerator="cpu") +xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=1, gpus=1, accelerator="cpu") class METEOR: @@ -495,15 +522,15 @@ def __init__( generation_size=10, metric=[ Metrics.bleu, - Metrics.bleu_1, - Metrics.bleu_4, + # Metrics.bleu_4, Metrics.chrf, Metrics.ter, - bert_score, - bleurt, - comet, meteor, - freeform_gpt_judge(judge_model_name="gpt-4o"), + bert_score, + bleurt_large, + comet_wmt22_da, + xcomet_xxl, + swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. From 5a479564f0ea3d4a294df1cab4015f2e66258de1 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 22 Nov 2024 13:59:25 +0100 Subject: [PATCH 10/43] Made generation_size dependent on the config. --- community_tasks/swiss_legal_evals.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 8958d3192..d0dc733a6 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -397,6 +397,7 @@ class LevelConfig: name: str text_col_name: str metadata_cols: list[str] + generation_size: int @dataclass @@ -420,16 +421,19 @@ def __post_init__(self): name="bge_level", text_col_name="bgeText", metadata_cols=["bge"], + generation_size=2048, ), "regeste_level": LevelConfig( name="regeste_level", text_col_name="regesteText", metadata_cols=["bge"], + generation_size=512, ), "text_level": LevelConfig( name="text_level", text_col_name="text", metadata_cols=["bge"], + generation_size=256, ), }, ) @@ -444,16 +448,19 @@ def __post_init__(self): name="law_level", text_col_name="lawText", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + generation_size=16384, ), "article_level": LevelConfig( name="article_level", text_col_name="articleText", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + generation_size=1024, ), "paragraph_level": LevelConfig( name="paragraph_level", text_col_name="paragraphText", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + generation_size=256, ), }, ) @@ -468,6 +475,7 @@ def __post_init__(self): name="press_release", text_col_name="text", metadata_cols=["filename"], + generation_size=1024, ) }, ) @@ -508,10 +516,11 @@ def __init__( src_lang: str, target_lang: str, ): + level_config = dataset_config.subsets[level_name] super().__init__( name=f"{dataset_config.name}-{level_name}:{src_lang}-{target_lang}", suite=["community"], - prompt_function=create_prompt_fn(dataset_config.subsets[level_name], src_lang, target_lang), + prompt_function=create_prompt_fn(level_config, src_lang, target_lang), hf_repo=dataset_config.hf_repo, hf_subset=level_name, hf_filter=None, @@ -519,7 +528,7 @@ def __init__( evaluation_splits=["test"], # ["validation", "test"], few_shots_split="validation", few_shots_select=None, - 
generation_size=10, + generation_size=level_config.generation_size, metric=[ Metrics.bleu, # Metrics.bleu_4, From 6bf7fa2463070f7c73cfbc2de0d5087953577952 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 22 Nov 2024 14:13:53 +0100 Subject: [PATCH 11/43] Fixed error in config. --- community_tasks/swiss_legal_evals.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index d0dc733a6..ae3a46e7d 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -452,13 +452,13 @@ def __post_init__(self): ), "article_level": LevelConfig( name="article_level", - text_col_name="articleText", + text_col_name="artText", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], generation_size=1024, ), "paragraph_level": LevelConfig( name="paragraph_level", - text_col_name="paragraphText", + text_col_name="parText", metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], generation_size=256, ), From 6cf1c2acd69a6fc35983276e3ab81fe27dc59f7f Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 22 Nov 2024 14:53:07 +0100 Subject: [PATCH 12/43] Fixed error in config. --- community_tasks/swiss_legal_evals.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index ae3a46e7d..c7242df74 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -447,19 +447,19 @@ def __post_init__(self): "law_level": LevelConfig( name="law_level", text_col_name="lawText", - metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + metadata_cols=["rsNr"], generation_size=16384, ), "article_level": LevelConfig( name="article_level", text_col_name="artText", - metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + metadata_cols=["rsNr"], generation_size=1024, ), "paragraph_level": LevelConfig( name="paragraph_level", text_col_name="parText", - metadata_cols=["abbreviation", "url", "dateApplicability", "rsNr"], + metadata_cols=["rsNr"], generation_size=256, ), }, From b5488017248605bee9f096f2efb688cae124eb5b Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 22 Nov 2024 14:58:48 +0100 Subject: [PATCH 13/43] Added support for multiple devices. 
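The patch below threads a `device` value through the metric constructors and rejects MPS for several scorers. Purely as a sketch of that idea (the helper is not part of the patch), device selection with an explicit opt-out for MPS could look roughly like this:

```
import torch


def pick_device(allow_mps: bool = False) -> str:
    """Prefer CUDA, optionally fall back to Apple MPS, otherwise use the CPU."""
    if torch.cuda.is_available():
        return "cuda"
    if allow_mps and torch.backends.mps.is_available():
        return "mps"
    return "cpu"


# Scorers such as BERTScore, BLEURT and COMET are kept on CUDA or CPU here,
# since their MPS support is not guaranteed.
metric_device = pick_device(allow_mps=False)
```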
--- community_tasks/swiss_legal_evals.py | 73 ++++++++++++++++++---------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index c7242df74..7ca161737 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -36,6 +36,7 @@ from dataclasses import dataclass import nltk +import torch from comet import download_model, load_from_checkpoint from nltk import word_tokenize from nltk.translate import meteor_score @@ -58,6 +59,8 @@ from lighteval.tasks.requests import Doc +device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu") + # CUSTOM METRICS @@ -171,10 +174,10 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): ) -swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") - - -def get_bert_score(model_type: str = "xlm-roberta-large"): +def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): + if device == "mps": + raise ValueError("MPS is not supported for BERTScore") + print(f"Loading BERTScore with model_type={model_type}, and device={device}...") score = BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip) score.bert_scorer = BERTScorer( # We could download the files from here and set the baseline_path ourselves: @@ -183,6 +186,7 @@ def get_bert_score(model_type: str = "xlm-roberta-large"): lang=None, # Needs to be set if rescale_with_baseline is True rescale_with_baseline=False, baseline_path=None, + device=device, ) return SampleLevelMetricGrouping( metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], @@ -202,11 +206,13 @@ def get_bert_score(model_type: str = "xlm-roberta-large"): ) -bert_score = get_bert_score(model_type="xlm-roberta-large") - - class BLEURT: - def __init__(self, model_size: str = "tiny", seq_len: int = 512): + def __init__( + self, + model_size: str = "tiny", + seq_len: int = 512, + device: str = "cpu", + ): """Creates a BLEURT scorer based on the model size (tiny, base, large) and sequence length (128, 512).""" assert model_size in [ "tiny", @@ -214,11 +220,14 @@ def __init__(self, model_size: str = "tiny", seq_len: int = 512): "large", ], "Model size must be either tiny, base, or large" assert seq_len in [128, 512], "Sequence length must be either 128 or 512" + if device == "mps": + raise ValueError("MPS is not supported for BLEURT") self.tokenizer = AutoTokenizer.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") self.model = AutoModelForSequenceClassification.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") - self.max_length = seq_len + self.model = self.model.to(device) self.model.eval() + self.max_length = seq_len def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: """Uses the stored BLEURT scorer to compute the score on the current sample. 
@@ -246,10 +255,11 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return scores.item() -def get_bleurt(model_size: str = "tiny", seq_len: int = 512): +def get_bleurt(model_size: str = "tiny", seq_len: int = 512, device: str = "cpu"): + print(f"Loading BLEURT with model_size={model_size}, seq_len={seq_len}, and device={device}...") return SampleLevelMetric( metric_name=f"bleurt_{model_size}", - sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len).compute, + sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len, device=device).compute, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, corpus_level_fn=statistics.mean, @@ -257,19 +267,16 @@ def get_bleurt(model_size: str = "tiny", seq_len: int = 512): ) -# bleurt_tiny = get_bleurt(model_size="tiny", seq_len=512) -# bleurt_base = get_bleurt(model_size="base", seq_len=512) -bleurt_large = get_bleurt(model_size="large", seq_len=512) - - class COMET: def __init__( self, model_name: str = "Unbabel/wmt22-comet-da", batch_size: int = 1, gpus: int = 1, - accelerator: str = "cpu", # "mps" leads to errors + accelerator: str = "cpu", ): + if accelerator == "mps": + raise ValueError("MPS is not supported for COMET") model_path = download_model(model_name) self.model = load_from_checkpoint(model_path) self.batch_size = batch_size @@ -299,15 +306,16 @@ def get_comet( model_name: str = "Unbabel/wmt22-comet-da", batch_size: int = 1, gpus: int = 1, - accelerator: str = "cpu", + device: str = "cpu", ): + print(f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}...") return SampleLevelMetric( metric_name=model_name.split("/")[-1], sample_level_fn=COMET( model_name=model_name, batch_size=batch_size, gpus=gpus, - accelerator=accelerator, + accelerator=device, ).compute, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, @@ -316,12 +324,6 @@ def get_comet( ) -# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. -comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=1, gpus=1, accelerator="cpu") -# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=1, gpus=1, accelerator="cpu") -xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=1, gpus=1, accelerator="cpu") - - class METEOR: def __init__(self, alpha=0.9, beta=3, gamma=0.5): self.alpha = alpha @@ -508,6 +510,25 @@ def prompt_fn(line: dict, task_name: str = None): return prompt_fn +bert_score = get_bert_score(model_type="xlm-roberta-large", device="cpu") + +# Only take the largest version +bleurt_large = get_bleurt(model_size="large", seq_len=512, device="cpu") + +# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. 
+comet_wmt22_da = get_comet( + model_name="Unbabel/wmt22-comet-da", + batch_size=1, + gpus=1, + device="cpu", +) +xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=1, gpus=1, device="cpu") +# XXL is likely too large for local evaluation +# xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=1, gpus=1, device="cpu") + +swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") + + class TranslationTask(LightevalTaskConfig): def __init__( self, @@ -538,7 +559,7 @@ def __init__( bert_score, bleurt_large, comet_wmt22_da, - xcomet_xxl, + xcomet_xl, swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 From ee2a83c0245b5e8ee4715b8263c64034b5230f7e Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 25 Nov 2024 13:31:56 +0100 Subject: [PATCH 14/43] Fixed some bugs for evaluation on GPUs. --- community_tasks/swiss_legal_evals.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 7ca161737..05814a526 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -59,7 +59,7 @@ from lighteval.tasks.requests import Doc -device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu") +device = "cuda" if torch.cuda.is_available() else "cpu" # CUSTOM METRICS @@ -249,6 +249,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: truncation=True, max_length=self.max_length, ) + inputs = {k: v.to(self.model.device) for k, v in inputs.items()} if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): hlog_warn(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") scores = self.model(**inputs)[0].squeeze() @@ -510,21 +511,20 @@ def prompt_fn(line: dict, task_name: str = None): return prompt_fn -bert_score = get_bert_score(model_type="xlm-roberta-large", device="cpu") +bert_score = get_bert_score(model_type="xlm-roberta-large", device=device) # Only take the largest version -bleurt_large = get_bleurt(model_size="large", seq_len=512, device="cpu") +bleurt_large = get_bleurt(model_size="large", seq_len=512, device=device) # There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. 
comet_wmt22_da = get_comet( model_name="Unbabel/wmt22-comet-da", - batch_size=1, + batch_size=32, gpus=1, - device="cpu", + device=device, ) -xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=1, gpus=1, device="cpu") -# XXL is likely too large for local evaluation -# xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=1, gpus=1, device="cpu") +xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=8, gpus=1, device=device) +xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") @@ -560,6 +560,7 @@ def __init__( bleurt_large, comet_wmt22_da, xcomet_xl, + xcomet_xxl, swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 From 36b7e943613931782573a8ca5ea04ce8836da6ab Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 26 Nov 2024 11:02:30 +0100 Subject: [PATCH 15/43] Added batch inference for heavy metrics and multiplied each score by 100. --- community_tasks/swiss_legal_evals.py | 178 ++++++++++++++------------- 1 file changed, 90 insertions(+), 88 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 05814a526..205fad36b 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -161,16 +161,17 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): higher_is_better={name: True}, category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.TRANSLATION, - sample_level_fn=JudgeLLMMixEval( - judge_model_name=judge_model_name, - template=swiss_legal_translation_judge, - process_judge_response=process_judge_response_freeform_gpt, - judge_backend="openai", - short_judge_name=judge_model_name, - ).compute, - corpus_level_fn={ - name: statistics.mean, - }, + sample_level_fn=lambda *args, **kwargs: [ + {k: v * 100 if k == f"judge_score_{judge_model_name}" else v for k, v in score_dict.items()} + for score_dict in JudgeLLMMixEval( + judge_model_name=judge_model_name, + template=swiss_legal_translation_judge, + process_judge_response=process_judge_response_freeform_gpt, + judge_backend="openai", + short_judge_name=judge_model_name, + ).compute(*args, **kwargs) + ], + corpus_level_fn={name: statistics.mean}, ) @@ -188,21 +189,22 @@ def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): baseline_path=None, device=device, ) + return SampleLevelMetricGrouping( metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], - sample_level_fn=score.compute, + higher_is_better={ + "BERTScore-P": True, + "BERTScore-R": True, + "BERTScore-F": True, + }, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, + sample_level_fn=lambda *args, **kwargs: {k: v * 100 for k, v in score.compute(*args, **kwargs).items()}, corpus_level_fn={ "BERTScore-P": statistics.mean, "BERTScore-R": statistics.mean, "BERTScore-F": statistics.mean, }, - higher_is_better={ - "BERTScore-P": True, - "BERTScore-R": True, - "BERTScore-F": True, - }, ) @@ -211,6 +213,7 @@ def __init__( self, model_size: str = "tiny", seq_len: int = 512, + batch_size: int = 32, device: str = "cpu", ): """Creates a BLEURT scorer based on the model size (tiny, base, large) and sequence length (128, 512).""" @@ -223,48 +226,52 @@ def __init__( if device == "mps": raise ValueError("MPS is not supported for BLEURT") 
+ self.metric_name = f"bleurt_{model_size}" self.tokenizer = AutoTokenizer.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") self.model = AutoModelForSequenceClassification.from_pretrained(f"Elron/bleurt-{model_size}-{seq_len}") self.model = self.model.to(device) self.model.eval() self.max_length = seq_len + self.batch_size = batch_size - def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: - """Uses the stored BLEURT scorer to compute the score on the current sample. - - Args: - golds (list[str]): Reference targets - predictions (list[str]): Predicted strings - - Returns: - float: Score over the current sample's items. - """ - if len(predictions) == 1: - predictions = predictions * len(golds) - inputs = self.tokenizer( - golds, - predictions, - return_tensors="pt", - padding=True, - truncation=True, - max_length=self.max_length, - ) - inputs = {k: v.to(self.model.device) for k, v in inputs.items()} - if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): - hlog_warn(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") - scores = self.model(**inputs)[0].squeeze() - return scores.item() - - -def get_bleurt(model_size: str = "tiny", seq_len: int = 512, device: str = "cpu"): - print(f"Loading BLEURT with model_size={model_size}, seq_len={seq_len}, and device={device}...") - return SampleLevelMetric( - metric_name=f"bleurt_{model_size}", - sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len, device=device).compute, - category=MetricCategory.GENERATIVE, + def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] + predictions = [response[0].result[0] for response in responses] + + all_scores = [] + for i in range(0, len(golds), self.batch_size): + batch_golds = golds[i : i + self.batch_size] + batch_predictions = predictions[i : i + self.batch_size] + + inputs = self.tokenizer( + batch_golds, + batch_predictions, + return_tensors="pt", + padding=True, + truncation=True, + max_length=self.max_length, + ) + inputs = {k: v.to(self.model.device) for k, v in inputs.items()} + if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): + hlog_warn(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") + with torch.no_grad(): + all_scores.extend(self.model(**inputs)[0].squeeze().tolist()) + + return [{self.metric_name: score * 100} for score in all_scores] + + +def get_bleurt(model_size: str = "tiny", seq_len: int = 512, batch_size: int = 32, device: str = "cpu"): + print( + f"Loading BLEURT with model_size={model_size}, seq_len={seq_len}, batch_size={batch_size}, and device={device}..." 
+ ) + name = f"bleurt_{model_size}" + return SampleLevelMetricGrouping( + metric_name=[name], + higher_is_better={name: True}, + category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.TRANSLATION, - corpus_level_fn=statistics.mean, - higher_is_better=True, + sample_level_fn=BLEURT(model_size=model_size, seq_len=seq_len, batch_size=batch_size, device=device).compute, + corpus_level_fn={name: statistics.mean}, ) @@ -272,56 +279,55 @@ class COMET: def __init__( self, model_name: str = "Unbabel/wmt22-comet-da", - batch_size: int = 1, + batch_size: int = 8, gpus: int = 1, accelerator: str = "cpu", ): if accelerator == "mps": raise ValueError("MPS is not supported for COMET") - model_path = download_model(model_name) - self.model = load_from_checkpoint(model_path) + + self.metric_name = model_name.split("/")[-1] + self.model = load_from_checkpoint(download_model(model_name)) self.batch_size = batch_size self.gpus = gpus self.accelerator = accelerator - def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: - data = [ - {"src": src, "mt": pred, "ref": gold} - for src, pred, gold in zip( - [kwargs["formatted_doc"].specific["source"]] * len(predictions), - predictions, - golds, - ) - ] + def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] + predictions = [response[0].result[0] for response in responses] + sources = [kwargs["formatted_doc"].specific["source"] for kwargs["formatted_doc"] in formatted_docs] + + data = [{"src": src, "mt": pred, "ref": gold} for src, pred, gold in zip(sources, predictions, golds)] model_output = self.model.predict( data, batch_size=self.batch_size, gpus=self.gpus, accelerator=self.accelerator, ) - # model_output["scores"] contains the sentence level scores - return model_output["system_score"] + + return [{self.metric_name: score * 100} for score in model_output["scores"]] def get_comet( model_name: str = "Unbabel/wmt22-comet-da", - batch_size: int = 1, + batch_size: int = 8, gpus: int = 1, device: str = "cpu", ): print(f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}...") - return SampleLevelMetric( - metric_name=model_name.split("/")[-1], + name = model_name.split("/")[-1] + return SampleLevelMetricGrouping( + metric_name=[name], + higher_is_better={name: True}, + category=MetricCategory.LLM_AS_JUDGE, + use_case=MetricUseCase.TRANSLATION, sample_level_fn=COMET( model_name=model_name, batch_size=batch_size, gpus=gpus, accelerator=device, ).compute, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - corpus_level_fn=statistics.mean, - higher_is_better=True, + corpus_level_fn={name: statistics.mean}, ) @@ -360,18 +366,19 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: for ref, pred in zip(golds, predictions) ] - return statistics.mean(scores) + return statistics.mean(scores) * 100 meteor = SampleLevelMetric( metric_name="meteor", - sample_level_fn=METEOR().compute, + higher_is_better=True, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, + sample_level_fn=METEOR().compute, corpus_level_fn=statistics.mean, - higher_is_better=True, ) + # EVALS WITH SUBSET # This is how you create a subset task (like MMLU), which has several subset # each being its own evaluation task. 
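The comment above describes the layout used further down: every dataset ships several levels, and each level becomes its own task per language pair. A rough, self-contained sketch of that expansion, with invented dataset and level names rather than the real `DatasetConfig`/`LevelConfig` objects:

```
from dataclasses import dataclass, field
from itertools import product


@dataclass
class ToyDataset:
    name: str
    levels: list[str]
    translation_pairs: list[tuple[str, str]] = field(default_factory=lambda: [("de", "fr"), ("fr", "de")])


def expand_task_names(datasets: list[ToyDataset]) -> list[str]:
    """One task name per (dataset, level, source language, target language)."""
    return [
        f"{ds.name}-{level}:{src}-{tgt}"
        for ds in datasets
        for level, (src, tgt) in product(ds.levels, ds.translation_pairs)
    ]


if __name__ == "__main__":
    toy = [ToyDataset(name="example_corpus", levels=["document_level", "paragraph_level"])]
    for task_name in expand_task_names(toy):
        print(task_name)  # e.g. example_corpus-document_level:de-fr
```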
@@ -517,14 +524,9 @@ def prompt_fn(line: dict, task_name: str = None): bleurt_large = get_bleurt(model_size="large", seq_len=512, device=device) # There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. -comet_wmt22_da = get_comet( - model_name="Unbabel/wmt22-comet-da", - batch_size=32, - gpus=1, - device=device, -) -xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=8, gpus=1, device=device) -xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) +comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=32, gpus=1, device=device) +# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=8, gpus=1, device=device) +# xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") @@ -546,9 +548,9 @@ def __init__( hf_subset=level_name, hf_filter=None, hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], # ["validation", "test"], + evaluation_splits=["test"], few_shots_split="validation", - few_shots_select=None, + few_shots_select=None, # TODO: add few-shot selection generation_size=level_config.generation_size, metric=[ Metrics.bleu, @@ -559,14 +561,14 @@ def __init__( bert_score, bleurt_large, comet_wmt22_da, - xcomet_xl, - xcomet_xxl, + # xcomet_xl, + # xcomet_xxl, swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. ], - stop_sequence=["\n"], + stop_sequence=["\n"], # TODO: Debug why this is not working for litellm inference trust_dataset=True, ) From 5ba218f8f1f08f073fff0a9d4fdd641ff7c35431 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 26 Nov 2024 14:04:00 +0100 Subject: [PATCH 16/43] Added few shot examples and did some refactoring. 
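The refactoring below switches the prompt to "Template A" (source-language tag, source text, then the target-language tag) and picks few-shot examples sequentially. A small illustration of what such a prompt could end up looking like; the example sentences are invented and the exact few-shot formatting applied by lighteval may differ:

```
from typing import List, Optional, Tuple


def build_translation_prompt(
    src_lang: str,
    tgt_lang: str,
    source_text: str,
    fewshot_pairs: Optional[List[Tuple[str, str]]] = None,
) -> str:
    """Template A style prompt ('DE: <text>', newline, 'FR: '), with optional worked examples in front."""
    blocks = []
    for shot_src, shot_tgt in fewshot_pairs or []:
        blocks.append(f"{src_lang.upper()}: {shot_src}\n{tgt_lang.upper()}: {shot_tgt}")
    blocks.append(f"{src_lang.upper()}: {source_text}\n{tgt_lang.upper()}: ")
    return "\n\n".join(blocks)


if __name__ == "__main__":
    print(
        build_translation_prompt(
            "de",
            "fr",
            "Die Beschwerde wird abgewiesen.",
            fewshot_pairs=[
                ("Das Gericht tritt auf die Klage nicht ein.", "Le tribunal n'entre pas en matière sur la demande.")
            ],
        )
    )
```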
--- community_tasks/swiss_legal_evals.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 205fad36b..032880771 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -501,7 +501,7 @@ def create_prompt_fn(level_config: LevelConfig, src_lang: str, target_lang: str) def prompt_fn(line: dict, task_name: str = None): # Following Template A from https://github.com/huggingface/lighteval/pull/389#issuecomment-2471580177 - custom_query = f"{src_lang.upper()}: {line[src_text_col]}\n{target_lang.upper()}:" + custom_query = f"{src_lang.upper()}: {line[src_text_col]}\n{target_lang.upper()}: " return Doc( task_name=task_name, @@ -536,21 +536,21 @@ def __init__( self, dataset_config: DatasetConfig, level_name: str, - src_lang: str, + source_lang: str, target_lang: str, ): level_config = dataset_config.subsets[level_name] super().__init__( - name=f"{dataset_config.name}-{level_name}:{src_lang}-{target_lang}", + name=f"{dataset_config.name}-{level_name}:{source_lang}-{target_lang}", suite=["community"], - prompt_function=create_prompt_fn(level_config, src_lang, target_lang), + prompt_function=create_prompt_fn(level_config, source_lang, target_lang), hf_repo=dataset_config.hf_repo, hf_subset=level_name, hf_filter=None, hf_avail_splits=["train", "validation", "test"], evaluation_splits=["test"], few_shots_split="validation", - few_shots_select=None, # TODO: add few-shot selection + few_shots_select="sequential", generation_size=level_config.generation_size, metric=[ Metrics.bleu, @@ -560,16 +560,19 @@ def __init__( meteor, bert_score, bleurt_large, - comet_wmt22_da, + comet_wmt22_da, # TODO: debug why this is not saved in the details # xcomet_xl, # xcomet_xxl, - swiss_legal_translation_judge_gpt_4o, + swiss_legal_translation_judge_gpt_4o, # TODO: debug why this is not showing up in the results # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. ], - stop_sequence=["\n"], # TODO: Debug why this is not working for litellm inference + stop_sequence=[".\n"], # just "\n" leads to problems for anthropic models trust_dataset=True, + # Remove the target language in the beginning if it exists: e.g., FR: {translation} + # Is only applied to the generative metrics, but also there seems not to be invoked, maybe not passed through? + # output_regex=f"(?:{target_lang.upper()}:\s*?)?(.*)", ) @@ -586,12 +589,12 @@ def __init__( TranslationTask( dataset_config=dataset, level_name=subset, - src_lang=src_lang, + source_lang=source_lang, target_lang=target_lang, ) for dataset in DATASETS for subset in dataset.subsets - for src_lang, target_lang in dataset.translation_pairs + for source_lang, target_lang in dataset.translation_pairs ] From 576b847b02e1f44a72f3c497315da7a04c9a2025 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 26 Nov 2024 14:48:21 +0100 Subject: [PATCH 17/43] Switched to an own judge class. 
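The dedicated judge class added below returns one dictionary of scores per sample instead of a single aggregate. As a loose sketch of how such per-sample dictionaries are then reduced at the corpus level (mirroring a `corpus_level_fn={name: statistics.mean}` mapping, with invented sample values):

```
import statistics


def aggregate_sample_scores(sample_scores: list[dict], corpus_level_fns: dict) -> dict:
    """Apply the configured corpus-level function to every metric key it covers."""
    aggregated = {}
    for metric_name, reduce_fn in corpus_level_fns.items():
        values = [scores[metric_name] for scores in sample_scores if metric_name in scores]
        aggregated[metric_name] = reduce_fn(values) if values else None
    return aggregated


if __name__ == "__main__":
    # Hypothetical per-sample outputs of a judge metric already scaled to 0-100.
    samples = [{"judge_score": 70.0}, {"judge_score": 20.0}, {"judge_score": 90.0}]
    print(aggregate_sample_scores(samples, {"judge_score": statistics.mean}))  # {'judge_score': 60.0}
```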
--- community_tasks/swiss_legal_evals.py | 37 +++++++++++++++++++--------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 032880771..22d3421fe 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -46,7 +46,7 @@ from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.metrics import Metrics -from lighteval.metrics.metrics_sample import BertScore, JudgeLLMMixEval +from lighteval.metrics.metrics_sample import BertScore, JudgeLLM from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip from lighteval.metrics.utils.metric_utils import ( MetricCategory, @@ -154,6 +154,24 @@ def swiss_legal_translation_judge(question, options, answer, gold): ] +class JudgeSwissLegalTranslation(JudgeLLM): + def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + """ + Compute the score of a generative task using a llm as a judge. + """ + questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] + options = [formatted_doc.choices for formatted_doc in formatted_docs] + golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] + predictions = [response[0].result[0] for response in responses] + + scores, _, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) + # Exclude the messages (user prompt) because they are too long + return [ + {f"judge_score_{self.short_judge_name}": score * 100, f"judgement_{self.short_judge_name}": judgment} + for score, judgment in zip(scores, judgements) + ] + + def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): name = f"swiss_legal_translation_judge_{judge_model_name}" return SampleLevelMetricGrouping( @@ -161,16 +179,13 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): higher_is_better={name: True}, category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.TRANSLATION, - sample_level_fn=lambda *args, **kwargs: [ - {k: v * 100 if k == f"judge_score_{judge_model_name}" else v for k, v in score_dict.items()} - for score_dict in JudgeLLMMixEval( - judge_model_name=judge_model_name, - template=swiss_legal_translation_judge, - process_judge_response=process_judge_response_freeform_gpt, - judge_backend="openai", - short_judge_name=judge_model_name, - ).compute(*args, **kwargs) - ], + sample_level_fn=JudgeSwissLegalTranslation( + judge_model_name=judge_model_name, + template=swiss_legal_translation_judge, + process_judge_response=process_judge_response_freeform_gpt, + judge_backend="openai", + short_judge_name=judge_model_name, + ).compute, corpus_level_fn={name: statistics.mean}, ) From 41bb59aeb3698f84398231a680be9d5b8c29dcd8 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 26 Nov 2024 16:48:39 +0100 Subject: [PATCH 18/43] Fixed issue with judge metric not showing up in results. 
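The fix in the next patch appears to come down to a naming contract: the keys of the per-sample score dictionary have to line up with the names declared in `metric_name`, otherwise the aggregated value never makes it into the reported results. A tiny, self-contained illustration of that contract (placeholder names, not the real lighteval plumbing):

```
def collect_results(declared_metric_names: list[str], per_sample_scores: list[dict]) -> dict:
    """Average only the declared metrics; values stored under other keys are lost."""
    results = {}
    for name in declared_metric_names:
        values = [sample[name] for sample in per_sample_scores if name in sample]
        results[name] = sum(values) / len(values) if values else None
    return results


if __name__ == "__main__":
    # Declared name and per-sample key disagree: the score never shows up.
    print(collect_results(["slt_judge_gpt-4o"], [{"judge_score_gpt-4o": 80.0}, {"judge_score_gpt-4o": 60.0}]))
    # With both sides using the same short name, it does.
    print(collect_results(["slt_judge_gpt-4o"], [{"slt_judge_gpt-4o": 80.0}, {"slt_judge_gpt-4o": 60.0}]))
```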
--- community_tasks/swiss_legal_evals.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 22d3421fe..70aa1e4b8 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -167,13 +167,13 @@ def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[D scores, _, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) # Exclude the messages (user prompt) because they are too long return [ - {f"judge_score_{self.short_judge_name}": score * 100, f"judgement_{self.short_judge_name}": judgment} + {self.short_judge_name: score * 100, f"{self.short_judge_name}_judgment": judgment} for score, judgment in zip(scores, judgements) ] def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): - name = f"swiss_legal_translation_judge_{judge_model_name}" + name = f"slt_judge_{judge_model_name}" return SampleLevelMetricGrouping( metric_name=[name], higher_is_better={name: True}, @@ -184,7 +184,7 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): template=swiss_legal_translation_judge, process_judge_response=process_judge_response_freeform_gpt, judge_backend="openai", - short_judge_name=judge_model_name, + short_judge_name=name, ).compute, corpus_level_fn={name: statistics.mean}, ) @@ -536,12 +536,12 @@ def prompt_fn(line: dict, task_name: str = None): bert_score = get_bert_score(model_type="xlm-roberta-large", device=device) # Only take the largest version -bleurt_large = get_bleurt(model_size="large", seq_len=512, device=device) +bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=64, device=device) # There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. -comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=32, gpus=1, device=device) -# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=8, gpus=1, device=device) -# xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) +comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) +xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=16, gpus=1, device=device) +xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") @@ -573,12 +573,12 @@ def __init__( Metrics.chrf, Metrics.ter, meteor, - bert_score, + bert_score, # TODO: think about allowing parallelization as well if slow bleurt_large, - comet_wmt22_da, # TODO: debug why this is not saved in the details - # xcomet_xl, - # xcomet_xxl, - swiss_legal_translation_judge_gpt_4o, # TODO: debug why this is not showing up in the results + comet_wmt22_da, + xcomet_xl, + xcomet_xxl, + swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. From d82cd91ac4b1b32f8b9c295e417311cad1027c3c Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 27 Nov 2024 12:15:45 +0000 Subject: [PATCH 19/43] Fixed issue with evaluation on GPUs. 
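The GPU fix that follows is the usual device-placement pattern: tokenized tensors are moved to the model's device before the forward pass, and scores are brought back to the CPU before turning them into Python lists. A generic sketch of that pattern (the checkpoint and sentence pair are placeholders):

```
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "Elron/bleurt-tiny-512"  # placeholder; any sequence-regression checkpoint works
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device).eval()

references = ["Le recours est rejeté."]
candidates = ["Le recours est admis."]

inputs = tokenizer(references, candidates, return_tensors="pt", padding=True, truncation=True)
inputs = {k: v.to(model.device) for k, v in inputs.items()}  # move every tensor to the model's device

with torch.no_grad():
    scores = model(**inputs)[0].squeeze(-1)

print(scores.cpu().tolist())  # back to the CPU before converting to a plain list
```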
--- community_tasks/swiss_legal_evals.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 70aa1e4b8..f370c9b14 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -270,7 +270,7 @@ def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[D if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): hlog_warn(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") with torch.no_grad(): - all_scores.extend(self.model(**inputs)[0].squeeze().tolist()) + all_scores.extend(self.model(**inputs)[0].squeeze().cpu().tolist()) return [{self.metric_name: score * 100} for score in all_scores] @@ -539,8 +539,8 @@ def prompt_fn(line: dict, task_name: str = None): bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=64, device=device) # There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. -comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) -xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=16, gpus=1, device=device) +# comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) +# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=16, gpus=1, device=device) xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") @@ -568,15 +568,14 @@ def __init__( few_shots_select="sequential", generation_size=level_config.generation_size, metric=[ - Metrics.bleu, - # Metrics.bleu_4, + Metrics.bleu, # Metrics.bleu_4, Metrics.chrf, Metrics.ter, meteor, bert_score, # TODO: think about allowing parallelization as well if slow bleurt_large, - comet_wmt22_da, - xcomet_xl, + # comet_wmt22_da, + # xcomet_xl, xcomet_xxl, swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: From 1b13d9fcd011c12e2907648158838b42cb74dc00 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 27 Nov 2024 17:13:52 +0000 Subject: [PATCH 20/43] Speed up metric computation on GPUs. --- community_tasks/swiss_legal_evals.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index f370c9b14..f86c563c3 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -61,6 +61,15 @@ device = "cuda" if torch.cuda.is_available() else "cpu" +# Try to optimize CUDA operations +if device == "cuda": + torch.backends.cudnn.benchmark = True # Enable cudnn auto-tuner + torch.backends.cuda.matmul.allow_tf32 = True # Enable TF32 for faster matrix multiplications + # Enable tensor cores if available + if torch.cuda.get_device_capability()[0] >= 7: + # This will speed up GPU inference, e.g., for COMET and BLEURT + torch.set_float32_matmul_precision('medium') + # CUSTOM METRICS From df0f3f02f0aca35cbc01e20af3d239d7f4c586c3 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 27 Nov 2024 17:15:40 +0000 Subject: [PATCH 21/43] Added more logging. 
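The commits around this point target speed (cuDNN tuning, TF32) and visibility (more logging). As a hedged illustration only, a small timer that reports durations through lighteval's `hlog`, for example wrapped around a scorer call, could look like this; the timed block here is a stand-in:

```
import time
from contextlib import contextmanager

from lighteval.logging.hierarchical_logger import hlog


@contextmanager
def log_duration(label: str):
    """Log how long the wrapped block took, e.g. around a metric's compute() call."""
    start = time.perf_counter()
    try:
        yield
    finally:
        hlog(f"{label} took {time.perf_counter() - start:.2f}s")


if __name__ == "__main__":
    with log_duration("dummy scoring"):
        time.sleep(0.1)  # stand-in for a real scorer such as BLEURT or COMET
```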
--- community_tasks/swiss_legal_evals.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index f86c563c3..4cea5ba94 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -43,7 +43,7 @@ from packaging import version from transformers import AutoModelForSequenceClassification, AutoTokenizer -from lighteval.logging.hierarchical_logger import hlog_warn +from lighteval.logging.hierarchical_logger import hlog_warn, hlog from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.metrics import Metrics from lighteval.metrics.metrics_sample import BertScore, JudgeLLM @@ -165,9 +165,7 @@ def swiss_legal_translation_judge(question, options, answer, gold): class JudgeSwissLegalTranslation(JudgeLLM): def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: - """ - Compute the score of a generative task using a llm as a judge. - """ + hlog(f"Judging {len(formatted_docs)} samples with {self.short_judge_name}...") questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] options = [formatted_doc.choices for formatted_doc in formatted_docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] @@ -202,7 +200,7 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): if device == "mps": raise ValueError("MPS is not supported for BERTScore") - print(f"Loading BERTScore with model_type={model_type}, and device={device}...") + hlog(f"Loading BERTScore with model_type={model_type}, and device={device}...") score = BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip) score.bert_scorer = BERTScorer( # We could download the files from here and set the baseline_path ourselves: @@ -259,6 +257,7 @@ def __init__( self.batch_size = batch_size def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + hlog(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] @@ -285,7 +284,7 @@ def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[D def get_bleurt(model_size: str = "tiny", seq_len: int = 512, batch_size: int = 32, device: str = "cpu"): - print( + hlog( f"Loading BLEURT with model_size={model_size}, seq_len={seq_len}, batch_size={batch_size}, and device={device}..." 
) name = f"bleurt_{model_size}" @@ -317,6 +316,7 @@ def __init__( self.accelerator = accelerator def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + hlog(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] sources = [kwargs["formatted_doc"].specific["source"] for kwargs["formatted_doc"] in formatted_docs] @@ -337,8 +337,8 @@ def get_comet( batch_size: int = 8, gpus: int = 1, device: str = "cpu", -): - print(f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}...") +): + hlog(f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}...") name = model_name.split("/")[-1] return SampleLevelMetricGrouping( metric_name=[name], @@ -541,16 +541,16 @@ def prompt_fn(line: dict, task_name: str = None): return prompt_fn - +# INFO: Batch sizes are optimized for an 80GB NVIDIA A100 GPU bert_score = get_bert_score(model_type="xlm-roberta-large", device=device) # Only take the largest version -bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=64, device=device) +bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) # There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. # comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) -# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=16, gpus=1, device=device) -xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=8, gpus=1, device=device) +# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) +xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") @@ -583,9 +583,7 @@ def __init__( meteor, bert_score, # TODO: think about allowing parallelization as well if slow bleurt_large, - # comet_wmt22_da, - # xcomet_xl, - xcomet_xxl, + xcomet_xxl, # xcomet_xl, comet_wmt22_da swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 From 980c25712eeff0276922a1f805fe941c3a669ce9 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 28 Nov 2024 14:53:47 +0100 Subject: [PATCH 22/43] Switched to sample level scores for faster evaluation. 
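The patch below batches the heavy scorers and makes them return one score per sample. The chunking pattern it relies on, `range(0, len(items), batch_size)`, can be factored into a small helper; this is only an illustration, the patch itself inlines the loop:

```
from typing import Iterator, Sequence, TypeVar

T = TypeVar("T")


def batched(items: Sequence[T], batch_size: int) -> Iterator[Sequence[T]]:
    """Yield consecutive slices of at most `batch_size` elements."""
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]


if __name__ == "__main__":
    golds = [f"reference {i}" for i in range(5)]
    for batch in batched(golds, batch_size=2):
        print(list(batch))  # ['reference 0', 'reference 1'], then ['reference 2', 'reference 3'], then ['reference 4']
```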
--- community_tasks/swiss_legal_evals.py | 190 +++++++++++++++++++++++---- 1 file changed, 163 insertions(+), 27 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 4cea5ba94..42f515746 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -40,10 +40,12 @@ from comet import download_model, load_from_checkpoint from nltk import word_tokenize from nltk.translate import meteor_score +from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu +from nltk.translate.chrf_score import sentence_chrf from packaging import version from transformers import AutoModelForSequenceClassification, AutoTokenizer -from lighteval.logging.hierarchical_logger import hlog_warn, hlog +from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.metrics import Metrics from lighteval.metrics.metrics_sample import BertScore, JudgeLLM @@ -64,11 +66,12 @@ # Try to optimize CUDA operations if device == "cuda": torch.backends.cudnn.benchmark = True # Enable cudnn auto-tuner - torch.backends.cuda.matmul.allow_tf32 = True # Enable TF32 for faster matrix multiplications + # Enable TF32 for faster matrix multiplications + torch.backends.cuda.matmul.allow_tf32 = True # Enable tensor cores if available if torch.cuda.get_device_capability()[0] >= 7: # This will speed up GPU inference, e.g., for COMET and BLEURT - torch.set_float32_matmul_precision('medium') + torch.set_float32_matmul_precision("medium") # CUSTOM METRICS @@ -164,7 +167,13 @@ def swiss_legal_translation_judge(question, options, answer, gold): class JudgeSwissLegalTranslation(JudgeLLM): - def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + def compute( + self, + sample_ids: list[str], + responses: list, + formatted_docs: list[Doc], + **kwargs, + ) -> dict[str, float]: hlog(f"Judging {len(formatted_docs)} samples with {self.short_judge_name}...") questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] options = [formatted_doc.choices for formatted_doc in formatted_docs] @@ -174,7 +183,10 @@ def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[D scores, _, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) # Exclude the messages (user prompt) because they are too long return [ - {self.short_judge_name: score * 100, f"{self.short_judge_name}_judgment": judgment} + { + self.short_judge_name: score * 100, + f"{self.short_judge_name}_judgment": judgment, + } for score, judgment in zip(scores, judgements) ] @@ -197,6 +209,9 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): ) +swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") + + def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): if device == "mps": raise ValueError("MPS is not supported for BERTScore") @@ -230,6 +245,10 @@ def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): ) +# INFO: Batch sizes are optimized for an 80GB NVIDIA A100 GPU +bert_score = get_bert_score(model_type="xlm-roberta-large", device=device) + + class BLEURT: def __init__( self, @@ -256,7 +275,13 @@ def __init__( self.max_length = seq_len self.batch_size = batch_size - def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> 
dict[str, float]: + def compute( + self, + sample_ids: list[str], + responses: list, + formatted_docs: list[Doc], + **kwargs, + ) -> dict[str, float]: hlog(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] @@ -283,7 +308,12 @@ def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[D return [{self.metric_name: score * 100} for score in all_scores] -def get_bleurt(model_size: str = "tiny", seq_len: int = 512, batch_size: int = 32, device: str = "cpu"): +def get_bleurt( + model_size: str = "tiny", + seq_len: int = 512, + batch_size: int = 32, + device: str = "cpu", +): hlog( f"Loading BLEURT with model_size={model_size}, seq_len={seq_len}, batch_size={batch_size}, and device={device}..." ) @@ -298,6 +328,10 @@ def get_bleurt(model_size: str = "tiny", seq_len: int = 512, batch_size: int = 3 ) +# Only take the largest version +bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) + + class COMET: def __init__( self, @@ -315,7 +349,13 @@ def __init__( self.gpus = gpus self.accelerator = accelerator - def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc], **kwargs) -> dict[str, float]: + def compute( + self, + sample_ids: list[str], + responses: list, + formatted_docs: list[Doc], + **kwargs, + ) -> dict[str, float]: hlog(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] @@ -337,7 +377,7 @@ def get_comet( batch_size: int = 8, gpus: int = 1, device: str = "cpu", -): +): hlog(f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}...") name = model_name.split("/")[-1] return SampleLevelMetricGrouping( @@ -355,6 +395,12 @@ def get_comet( ) +# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. +# comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) +# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) +xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) + + class METEOR: def __init__(self, alpha=0.9, beta=3, gamma=0.5): self.alpha = alpha @@ -403,6 +449,110 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: ) +class BLEU: + def __init__( + self, + weights=(0.25, 0.25, 0.25, 0.25), + smoothing_function=None, + auto_reweigh=False, + ): + """ + Initialize BLEU scorer with specified n-gram weights. + Default weights are for BLEU-4 (equal weights for 1-4 grams). + + Args: + weights: Tuple of weights for unigrams through 4-grams + smoothing_function: Optional smoothing function for BLEU computation + auto_reweigh: Whether to automatically reweigh the scores based on reference length + """ + self.weights = weights + self.smoothing_function = smoothing_function or SmoothingFunction().method1 + self.auto_reweigh = auto_reweigh + + # Ensure NLTK data is downloaded + nltk.download("punkt") + + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: + """ + Compute BLEU score for a list of predictions against their references. 
+ + Args: + golds: List of reference strings + predictions: List of prediction strings + + Returns: + Mean BLEU score scaled to 0-100 + """ + scores = [] + for ref, pred in zip(golds, predictions): + # Tokenize the reference and prediction + reference = [word_tokenize(ref)] + hypothesis = word_tokenize(pred) + + # Calculate BLEU score for this pair + score = sentence_bleu( + references=reference, + hypothesis=hypothesis, + weights=self.weights, + smoothing_function=self.smoothing_function, + auto_reweigh=self.auto_reweigh, + ) + scores.append(score) + + return statistics.mean(scores) * 100 + + +bleu = SampleLevelMetric( + metric_name="bleu", + higher_is_better=True, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=BLEU().compute, + corpus_level_fn=statistics.mean, +) + + +class CHRF: + def __init__(self, beta: float = 3.0, max_len: int = 6, min_len: int = 1): + """ + Initialize chrF scorer with specified parameters. + beta: Weight of recall vs precision (default: 3.0) + max_len: Maximum n-gram order (default: 6) + min_len: Minimum n-gram order (default: 1) + """ + self.beta = beta + self.max_len = max_len + self.min_len = min_len + + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: + """ + Compute chrF score for a list of predictions against their references. + """ + scores = [] + for ref, pred in zip(golds, predictions): + score = sentence_chrf( + ref, + pred, + min_len=self.min_len, + max_len=self.max_len, + beta=self.beta, + ignore_whitespace=True, + ) + scores.append(score) + + return statistics.mean(scores) * 100 + + +chrf = SampleLevelMetric( + metric_name="chrf", + higher_is_better=True, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=CHRF().compute, + corpus_level_fn=statistics.mean, +) + + # EVALS WITH SUBSET # This is how you create a subset task (like MMLU), which has several subset # each being its own evaluation task. @@ -541,19 +691,6 @@ def prompt_fn(line: dict, task_name: str = None): return prompt_fn -# INFO: Batch sizes are optimized for an 80GB NVIDIA A100 GPU -bert_score = get_bert_score(model_type="xlm-roberta-large", device=device) - -# Only take the largest version -bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) - -# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. 
-# comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) -# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) -xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) - -swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") - class TranslationTask(LightevalTaskConfig): def __init__( @@ -577,13 +714,12 @@ def __init__( few_shots_select="sequential", generation_size=level_config.generation_size, metric=[ - Metrics.bleu, # Metrics.bleu_4, - Metrics.chrf, - Metrics.ter, + bleu, # Use sample level BLEU for faster evaluation + chrf, # Use sample level chrF for faster evaluation meteor, - bert_score, # TODO: think about allowing parallelization as well if slow + bert_score, bleurt_large, - xcomet_xxl, # xcomet_xl, comet_wmt22_da + xcomet_xxl, # Just use one, disregarding xcomet_xl, comet_wmt22_da swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 From 9a60dc0faf8aca6706a7c110bd8898564aed3640 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 29 Nov 2024 17:42:38 +0100 Subject: [PATCH 23/43] Added rescale_with_baseline for BERTScore for better differentiation. --- community_tasks/swiss_legal_evals.py | 37 +++++++++++++++++++++------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 42f515746..41f017ba4 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -32,10 +32,12 @@ """ import importlib.metadata as importlib_metadata +import os import statistics from dataclasses import dataclass import nltk +import requests import torch from comet import download_model, load_from_checkpoint from nltk import word_tokenize @@ -212,21 +214,42 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") -def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): +def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-roberta-large", device: str = "cpu"): if device == "mps": raise ValueError("MPS is not supported for BERTScore") - hlog(f"Loading BERTScore with model_type={model_type}, and device={device}...") + hlog( + f"Loading BERTScore with lang={language}, num_layers={num_layers}, model_type={model_type}, and device={device}..." 
+ ) score = BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip) score.bert_scorer = BERTScorer( # We could download the files from here and set the baseline_path ourselves: # https://github.com/Tiiiger/bert_score/tree/master/bert_score/rescale_baseline model_type=model_type, - lang=None, # Needs to be set if rescale_with_baseline is True - rescale_with_baseline=False, + lang=language, # Needs to be set if rescale_with_baseline is True + num_layers=num_layers, # Needs to be set if rescale_with_baseline is True + rescale_with_baseline=True, baseline_path=None, device=device, ) + if language == "rm": + language = "it" + hlog_warn("There is no BERTScore baseline file for Rumantsch, using Italian instead.") + + # Create directory structure if it doesn't exist + os.makedirs(os.path.dirname(score.bert_scorer.baseline_path), exist_ok=True) + + # Download the baseline file if it doesn't exist + if not os.path.exists(score.bert_scorer.baseline_path): + raw_url = f"https://raw.githubusercontent.com/Tiiiger/bert_score/master/bert_score/rescale_baseline/{language}/{model_type}.tsv" + hlog(f"Downloading BERTScore baseline file from {raw_url}") + response = requests.get(raw_url) + if response.status_code == 200: + with open(score.bert_scorer.baseline_path, "wb") as f: + f.write(response.content) + else: + raise RuntimeError(f"Failed to download baseline file from {raw_url}") + return SampleLevelMetricGrouping( metric_name=["BERTScore-P", "BERTScore-R", "BERTScore-F"], higher_is_better={ @@ -245,10 +268,6 @@ def get_bert_score(model_type: str = "xlm-roberta-large", device: str = "cpu"): ) -# INFO: Batch sizes are optimized for an 80GB NVIDIA A100 GPU -bert_score = get_bert_score(model_type="xlm-roberta-large", device=device) - - class BLEURT: def __init__( self, @@ -717,7 +736,7 @@ def __init__( bleu, # Use sample level BLEU for faster evaluation chrf, # Use sample level chrF for faster evaluation meteor, - bert_score, + get_bert_score(language=target_lang, model_type="xlm-roberta-large", device=device), bleurt_large, xcomet_xxl, # Just use one, disregarding xcomet_xl, comet_wmt22_da swiss_legal_translation_judge_gpt_4o, From 819b949cb14e56121d22f5643681e7cd96330864 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 2 Dec 2024 11:41:28 +0100 Subject: [PATCH 24/43] Adapted metrics. 
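The rescale_with_baseline setup for BERTScore (PATCH 23) depends on a per-language baseline TSV being present on disk. A self-contained sketch of that fallback-and-download logic, mirroring the URL pattern used in that diff; the helper name and the cache path in the usage comment are illustrative assumptions:

import os

import requests

def ensure_baseline(language: str, model_type: str, baseline_path: str) -> str:
    # No Romansh baseline is published upstream, so fall back to Italian.
    if language == "rm":
        language = "it"
    if not os.path.exists(baseline_path):
        url = (
            "https://raw.githubusercontent.com/Tiiiger/bert_score/master/"
            f"bert_score/rescale_baseline/{language}/{model_type}.tsv"
        )
        os.makedirs(os.path.dirname(baseline_path) or ".", exist_ok=True)
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # fail loudly instead of caching an error page
        with open(baseline_path, "wb") as f:
            f.write(response.content)
    return baseline_path

# e.g. ensure_baseline("de", "xlm-roberta-large", "baselines/de/xlm-roberta-large.tsv")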
--- community_tasks/swiss_legal_evals.py | 39 +++++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 41f017ba4..e1d35085b 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -268,6 +268,19 @@ def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-r ) +# Create BERTScore metrics for each language + + +bert_scores = { + lang: get_bert_score( + language=lang, + model_type="xlm-roberta-large", + device=device, + ) + for lang in ["de", "fr", "it", "rm", "en"] +} + + class BLEURT: def __init__( self, @@ -347,7 +360,6 @@ def get_bleurt( ) -# Only take the largest version bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) @@ -521,8 +533,8 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return statistics.mean(scores) * 100 -bleu = SampleLevelMetric( - metric_name="bleu", +bleu_sentence = SampleLevelMetric( + metric_name="bleu_sentence", higher_is_better=True, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, @@ -562,8 +574,8 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return statistics.mean(scores) * 100 -chrf = SampleLevelMetric( - metric_name="chrf", +chrf_sentence = SampleLevelMetric( + metric_name="chrf_sentence", higher_is_better=True, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.TRANSLATION, @@ -733,12 +745,19 @@ def __init__( few_shots_select="sequential", generation_size=level_config.generation_size, metric=[ - bleu, # Use sample level BLEU for faster evaluation - chrf, # Use sample level chrF for faster evaluation + # ===== Lexical metrics ===== + # Metrics.ter, # TER is a corpus level metric that is very slow in bootstrapping + bleu_sentence, # Use sample level BLEU for faster evaluation + Metrics.bleu, # Disable this if it is too slow + Metrics.bleu_1, + Metrics.bleu_4, + Metrics.chrf, # Disable this if it is too slow + chrf_sentence, # Use sample level chrF for faster evaluation meteor, - get_bert_score(language=target_lang, model_type="xlm-roberta-large", device=device), - bleurt_large, - xcomet_xxl, # Just use one, disregarding xcomet_xl, comet_wmt22_da + # ===== Model-based metrics ===== + bert_scores[target_lang], + bleurt_large, # Only take the largest version, disregarding base and tiny + xcomet_xxl, # Only take the largest version, disregarding xcomet_xl, comet_wmt22_da swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 From e758316f7b906254936098e272f57575b04444ef Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 2 Dec 2024 16:11:37 +0100 Subject: [PATCH 25/43] Switched to sacrebleu implementation for sentence level translation metrics. 
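The diff below swaps the NLTK sentence metrics for sacrebleu's sentence-level API. A short standalone sketch of that API with invented sentence pairs; note that sacrebleu's .score values are already on a 0-100 scale (TER is an edit rate where lower is better), so no extra scaling is needed when averaging:

import statistics

from sacrebleu import sentence_bleu, sentence_chrf, sentence_ter

golds = ["Le contrat est nul.", "Le tribunal rejette le recours."]
preds = ["Le contrat est nul.", "Le tribunal a rejeté le recours."]

bleu_scores = [sentence_bleu(pred, [ref]).score for ref, pred in zip(golds, preds)]
chrf_scores = [sentence_chrf(pred, [ref]).score for ref, pred in zip(golds, preds)]
ter_scores = [sentence_ter(pred, [ref]).score for ref, pred in zip(golds, preds)]

print(round(statistics.mean(bleu_scores), 2))  # sample-level BLEU, 0-100
print(round(statistics.mean(chrf_scores), 2))  # sample-level chrF, 0-100
print(round(statistics.mean(ter_scores), 2))   # sample-level TER, lower is better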
--- community_tasks/swiss_legal_evals.py | 99 ++++++++++------------------ 1 file changed, 33 insertions(+), 66 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index e1d35085b..ca10037b9 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -42,9 +42,8 @@ from comet import download_model, load_from_checkpoint from nltk import word_tokenize from nltk.translate import meteor_score -from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu -from nltk.translate.chrf_score import sentence_chrf from packaging import version +from sacrebleu import sentence_bleu, sentence_chrf, sentence_ter from transformers import AutoModelForSequenceClassification, AutoTokenizer from lighteval.logging.hierarchical_logger import hlog, hlog_warn @@ -481,28 +480,6 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: class BLEU: - def __init__( - self, - weights=(0.25, 0.25, 0.25, 0.25), - smoothing_function=None, - auto_reweigh=False, - ): - """ - Initialize BLEU scorer with specified n-gram weights. - Default weights are for BLEU-4 (equal weights for 1-4 grams). - - Args: - weights: Tuple of weights for unigrams through 4-grams - smoothing_function: Optional smoothing function for BLEU computation - auto_reweigh: Whether to automatically reweigh the scores based on reference length - """ - self.weights = weights - self.smoothing_function = smoothing_function or SmoothingFunction().method1 - self.auto_reweigh = auto_reweigh - - # Ensure NLTK data is downloaded - nltk.download("punkt") - def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: """ Compute BLEU score for a list of predictions against their references. @@ -516,19 +493,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: """ scores = [] for ref, pred in zip(golds, predictions): - # Tokenize the reference and prediction - reference = [word_tokenize(ref)] - hypothesis = word_tokenize(pred) - - # Calculate BLEU score for this pair - score = sentence_bleu( - references=reference, - hypothesis=hypothesis, - weights=self.weights, - smoothing_function=self.smoothing_function, - auto_reweigh=self.auto_reweigh, - ) - scores.append(score) + scores.append(sentence_bleu(pred, [ref]).score) return statistics.mean(scores) * 100 @@ -544,32 +509,13 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: class CHRF: - def __init__(self, beta: float = 3.0, max_len: int = 6, min_len: int = 1): - """ - Initialize chrF scorer with specified parameters. - beta: Weight of recall vs precision (default: 3.0) - max_len: Maximum n-gram order (default: 6) - min_len: Minimum n-gram order (default: 1) - """ - self.beta = beta - self.max_len = max_len - self.min_len = min_len - def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: """ Compute chrF score for a list of predictions against their references. 
""" scores = [] for ref, pred in zip(golds, predictions): - score = sentence_chrf( - ref, - pred, - min_len=self.min_len, - max_len=self.max_len, - beta=self.beta, - ignore_whitespace=True, - ) - scores.append(score) + scores.append(sentence_chrf(pred, [ref]).score) return statistics.mean(scores) * 100 @@ -584,9 +530,29 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: ) +class TER: + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: + """ + Compute TER score for a list of predictions against their references. + """ + scores = [] + for ref, pred in zip(golds, predictions): + scores.append(sentence_ter(pred, [ref]).score) + + return statistics.mean(scores) * 100 + + +ter_sentence = SampleLevelMetric( + metric_name="ter_sentence", + higher_is_better=False, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=TER().compute, + corpus_level_fn=statistics.mean, +) + + # EVALS WITH SUBSET -# This is how you create a subset task (like MMLU), which has several subset -# each being its own evaluation task. def create_translation_pairs(langs_list: list) -> list[tuple]: @@ -746,13 +712,14 @@ def __init__( generation_size=level_config.generation_size, metric=[ # ===== Lexical metrics ===== - # Metrics.ter, # TER is a corpus level metric that is very slow in bootstrapping - bleu_sentence, # Use sample level BLEU for faster evaluation - Metrics.bleu, # Disable this if it is too slow - Metrics.bleu_1, - Metrics.bleu_4, - Metrics.chrf, # Disable this if it is too slow - chrf_sentence, # Use sample level chrF for faster evaluation + # Corpus level metrics + Metrics.bleu, + Metrics.chrf, + # Metrics.ter, # TER often hangs for a while and takes more than 10 minutes to compute + # Sample level metrics + bleu_sentence, + chrf_sentence, + ter_sentence, meteor, # ===== Model-based metrics ===== bert_scores[target_lang], From d08163fa2e44c5e536bb763361f086a63e76aac5 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 4 Dec 2024 17:25:46 +0100 Subject: [PATCH 26/43] Added more stop sequences. --- community_tasks/swiss_legal_evals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index ca10037b9..f4ecfe9ee 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -730,7 +730,7 @@ def __init__( # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. ], - stop_sequence=[".\n"], # just "\n" leads to problems for anthropic models + stop_sequence=[".\n", "\n", ""], # just "\n" leads to problems for anthropic models trust_dataset=True, # Remove the target language in the beginning if it exists: e.g., FR: {translation} # Is only applied to the generative metrics, but also there seems not to be invoked, maybe not passed through? From 86c67bc3f237b0bccb37b25fcd0a9c404f600186 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Thu, 5 Dec 2024 10:27:30 +0100 Subject: [PATCH 27/43] Made stop_sequence level specific. 
--- community_tasks/swiss_legal_evals.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index f4ecfe9ee..caee42e44 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -579,6 +579,7 @@ class LevelConfig: text_col_name: str metadata_cols: list[str] generation_size: int + stop_sequence: list[str] # just "\n" leads to problems for anthropic models, maybe we need a special case there @dataclass @@ -603,18 +604,21 @@ def __post_init__(self): text_col_name="bgeText", metadata_cols=["bge"], generation_size=2048, + stop_sequence=["", "\n\n"], ), "regeste_level": LevelConfig( name="regeste_level", text_col_name="regesteText", metadata_cols=["bge"], generation_size=512, + stop_sequence=["", "\n\n"], ), "text_level": LevelConfig( name="text_level", text_col_name="text", metadata_cols=["bge"], generation_size=256, + stop_sequence=["", ".\n", "\n"], ), }, ) @@ -630,18 +634,21 @@ def __post_init__(self): text_col_name="lawText", metadata_cols=["rsNr"], generation_size=16384, + stop_sequence=["", "\n\n"], ), "article_level": LevelConfig( name="article_level", text_col_name="artText", metadata_cols=["rsNr"], generation_size=1024, + stop_sequence=["", "\n\n"], ), "paragraph_level": LevelConfig( name="paragraph_level", text_col_name="parText", metadata_cols=["rsNr"], generation_size=256, + stop_sequence=["", ".\n", "\n"], ), }, ) @@ -657,6 +664,7 @@ def __post_init__(self): text_col_name="text", metadata_cols=["filename"], generation_size=1024, + stop_sequence=["", "\n\n"], ) }, ) @@ -730,7 +738,7 @@ def __init__( # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. ], - stop_sequence=[".\n", "\n", ""], # just "\n" leads to problems for anthropic models + stop_sequence=level_config.stop_sequence, trust_dataset=True, # Remove the target language in the beginning if it exists: e.g., FR: {translation} # Is only applied to the generative metrics, but also there seems not to be invoked, maybe not passed through? From f10994551a2c36550dde0f7fcd4c0b722f66038c Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 6 Dec 2024 10:00:25 +0100 Subject: [PATCH 28/43] Added gemba metric. 
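The GEMBA-MQM judge added below calls an OpenAI model through the gemba package. A sketch of the underlying call, mirroring how the new metric class invokes it; the sentence pair is invented, and running it assumes the gemba package is installed and OPENAI_API_KEY is set:

from gemba import get_gemba_scores

sources = ["Der Vertrag ist nichtig, wenn sein Inhalt widerrechtlich ist."]
hypotheses = ["The contract is void if its content is unlawful."]

# One source/target language pair per call, as in the metric class below.
answers, errors = get_gemba_scores(
    sources, hypotheses, "de", "en", method="GEMBA-MQM_norm", model="gpt-4o"
)
print(answers[0])  # normalized MQM-style score for the first sample
print(errors[0])   # per-category error annotations returned by GEMBA-MQM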
--- community_tasks/swiss_legal_evals.py | 56 ++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index caee42e44..f690fbea0 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -40,6 +40,7 @@ import requests import torch from comet import download_model, load_from_checkpoint +from gemba import get_gemba_scores from nltk import word_tokenize from nltk.translate import meteor_score from packaging import version @@ -213,6 +214,52 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") +class GEMBA: + def __init__(self, method: str = "GEMBA-MQM_norm", model: str = "gpt-4o"): + self.method = method + self.model = model + self.name = f"{method.split('_')[0]}_{model}" + + def compute( + self, + sample_ids: list[str], + responses: list, + formatted_docs: list[Doc], + **kwargs, + ) -> dict[str, float]: + hlog(f"Judging {len(formatted_docs)} samples with {self.name}...") + source_langs = [formatted_doc.specific["source_lang"] for formatted_doc in formatted_docs] + target_langs = [formatted_doc.specific["target_lang"] for formatted_doc in formatted_docs] + # There should be only one language each in the batch + assert len(set(source_langs)) == len(set(target_langs)) == 1 + sources = [formatted_doc.specific["source"] for formatted_doc in formatted_docs] + predictions = [response[0].result[0] for response in responses] + + answers, errors = get_gemba_scores( + sources, predictions, source_langs[0], target_langs[0], method=self.method, model=self.model + ) + + # Convert defaultdict to dict + errors = [[{key: value} for key, value in error.items()] for error in errors] + + return [{self.name: answer, f"{self.name}_errors": error} for answer, error in zip(answers, errors)] + + +def get_gemba_judge(method: str = "GEMBA-MQM_norm", model: str = "gpt-4o"): + name = f"{method.split('_')[0]}_{model}" + return SampleLevelMetricGrouping( + metric_name=[name], + higher_is_better={name: True}, + category=MetricCategory.LLM_AS_JUDGE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=GEMBA(method=method, model=model).compute, + corpus_level_fn={name: statistics.mean}, + ) + + +gemba_mqm_gpt_4o = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") + + def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-roberta-large", device: str = "cpu"): if device == "mps": raise ValueError("MPS is not supported for BERTScore") @@ -670,17 +717,17 @@ def __post_init__(self): ) -def create_prompt_fn(level_config: LevelConfig, src_lang: str, target_lang: str): +def create_prompt_fn(level_config: LevelConfig, source_lang: str, target_lang: str): """ Create a prompt function for a given level configuration. 
""" text_col = level_config.text_col_name - src_text_col = f"{src_lang}_{text_col}" + src_text_col = f"{source_lang}_{text_col}" target_text_col = f"{target_lang}_{text_col}" def prompt_fn(line: dict, task_name: str = None): # Following Template A from https://github.com/huggingface/lighteval/pull/389#issuecomment-2471580177 - custom_query = f"{src_lang.upper()}: {line[src_text_col]}\n{target_lang.upper()}: " + custom_query = f"{source_lang.upper()}: {line[src_text_col]}\n{target_lang.upper()}: " return Doc( task_name=task_name, @@ -691,6 +738,8 @@ def prompt_fn(line: dict, task_name: str = None): **{col: line[col] for col in level_config.metadata_cols}, "question": custom_query, "source": line[src_text_col], + "source_lang": source_lang, + "target_lang": target_lang, }, ) @@ -733,6 +782,7 @@ def __init__( bert_scores[target_lang], bleurt_large, # Only take the largest version, disregarding base and tiny xcomet_xxl, # Only take the largest version, disregarding xcomet_xl, comet_wmt22_da + gemba_mqm_gpt_4o, swiss_legal_translation_judge_gpt_4o, # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 From f357176e8d8513dcdeadf36d50fb11d9abd7dd97 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 9 Dec 2024 15:39:31 +0100 Subject: [PATCH 29/43] Updated logging. --- community_tasks/swiss_legal_evals.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index f690fbea0..f5ae79953 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -30,8 +30,8 @@ Author: Joel Niklaus """ - import importlib.metadata as importlib_metadata +import logging import os import statistics from dataclasses import dataclass @@ -47,7 +47,6 @@ from sacrebleu import sentence_bleu, sentence_chrf, sentence_ter from transformers import AutoModelForSequenceClassification, AutoTokenizer -from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.metrics import Metrics from lighteval.metrics.metrics_sample import BertScore, JudgeLLM @@ -63,6 +62,9 @@ from lighteval.tasks.requests import Doc +logger = logging.getLogger(__name__) + + device = "cuda" if torch.cuda.is_available() else "cpu" # Try to optimize CUDA operations @@ -176,7 +178,7 @@ def compute( formatted_docs: list[Doc], **kwargs, ) -> dict[str, float]: - hlog(f"Judging {len(formatted_docs)} samples with {self.short_judge_name}...") + logger.info(f"Judging {len(formatted_docs)} samples with {self.short_judge_name}...") questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] options = [formatted_doc.choices for formatted_doc in formatted_docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] @@ -227,7 +229,7 @@ def compute( formatted_docs: list[Doc], **kwargs, ) -> dict[str, float]: - hlog(f"Judging {len(formatted_docs)} samples with {self.name}...") + logger.info(f"Judging {len(formatted_docs)} samples with {self.name}...") source_langs = [formatted_doc.specific["source_lang"] for formatted_doc in formatted_docs] target_langs = [formatted_doc.specific["target_lang"] for formatted_doc in formatted_docs] # There should be only one language each in the batch @@ -263,7 +265,7 @@ def get_gemba_judge(method: str = "GEMBA-MQM_norm", model: str = "gpt-4o"): def get_bert_score(language: 
str, num_layers: int = 24, model_type: str = "xlm-roberta-large", device: str = "cpu"): if device == "mps": raise ValueError("MPS is not supported for BERTScore") - hlog( + logger.info( f"Loading BERTScore with lang={language}, num_layers={num_layers}, model_type={model_type}, and device={device}..." ) score = BertScore(normalize_gold=remove_braces, normalize_pred=remove_braces_and_strip) @@ -280,7 +282,7 @@ def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-r if language == "rm": language = "it" - hlog_warn("There is no BERTScore baseline file for Rumantsch, using Italian instead.") + logger.warning("There is no BERTScore baseline file for Rumantsch, using Italian instead.") # Create directory structure if it doesn't exist os.makedirs(os.path.dirname(score.bert_scorer.baseline_path), exist_ok=True) @@ -288,7 +290,7 @@ def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-r # Download the baseline file if it doesn't exist if not os.path.exists(score.bert_scorer.baseline_path): raw_url = f"https://raw.githubusercontent.com/Tiiiger/bert_score/master/bert_score/rescale_baseline/{language}/{model_type}.tsv" - hlog(f"Downloading BERTScore baseline file from {raw_url}") + logger.info(f"Downloading BERTScore baseline file from {raw_url}") response = requests.get(raw_url) if response.status_code == 200: with open(score.bert_scorer.baseline_path, "wb") as f: @@ -360,7 +362,7 @@ def compute( formatted_docs: list[Doc], **kwargs, ) -> dict[str, float]: - hlog(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") + logger.info(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] @@ -379,7 +381,7 @@ def compute( ) inputs = {k: v.to(self.model.device) for k, v in inputs.items()} if any(len(encoding) == self.max_length for encoding in inputs["input_ids"]): - hlog_warn(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") + logger.warning(f"Some inputs were truncated to max_length={self.max_length} in BLEURT scoring") with torch.no_grad(): all_scores.extend(self.model(**inputs)[0].squeeze().cpu().tolist()) @@ -392,7 +394,7 @@ def get_bleurt( batch_size: int = 32, device: str = "cpu", ): - hlog( + logger.info( f"Loading BLEURT with model_size={model_size}, seq_len={seq_len}, batch_size={batch_size}, and device={device}..." ) name = f"bleurt_{model_size}" @@ -433,7 +435,7 @@ def compute( formatted_docs: list[Doc], **kwargs, ) -> dict[str, float]: - hlog(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") + logger.info(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] sources = [kwargs["formatted_doc"].specific["source"] for kwargs["formatted_doc"] in formatted_docs] @@ -455,7 +457,9 @@ def get_comet( gpus: int = 1, device: str = "cpu", ): - hlog(f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}...") + logger.info( + f"Loading COMET with model_name={model_name}, batch_size={batch_size}, gpus={gpus}, and device={device}..." 
+ ) name = model_name.split("/")[-1] return SampleLevelMetricGrouping( metric_name=[name], From 2d4c0ed803c719f306cedb9892d1a01440855b9f Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 9 Dec 2024 15:41:34 +0100 Subject: [PATCH 30/43] Updated stop_sequence. --- community_tasks/swiss_legal_evals.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index f5ae79953..16177c2e3 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -655,14 +655,14 @@ def __post_init__(self): text_col_name="bgeText", metadata_cols=["bge"], generation_size=2048, - stop_sequence=["", "\n\n"], + stop_sequence=["", ".\n\n", "\n\n"], ), "regeste_level": LevelConfig( name="regeste_level", text_col_name="regesteText", metadata_cols=["bge"], generation_size=512, - stop_sequence=["", "\n\n"], + stop_sequence=["", ".\n\n", "\n\n"], ), "text_level": LevelConfig( name="text_level", @@ -685,14 +685,14 @@ def __post_init__(self): text_col_name="lawText", metadata_cols=["rsNr"], generation_size=16384, - stop_sequence=["", "\n\n"], + stop_sequence=["", ".\n\n", "\n\n"], ), "article_level": LevelConfig( name="article_level", text_col_name="artText", metadata_cols=["rsNr"], generation_size=1024, - stop_sequence=["", "\n\n"], + stop_sequence=["", ".\n\n", "\n\n"], ), "paragraph_level": LevelConfig( name="paragraph_level", @@ -715,7 +715,7 @@ def __post_init__(self): text_col_name="text", metadata_cols=["filename"], generation_size=1024, - stop_sequence=["", "\n\n"], + stop_sequence=["", ".\n\n", "\n\n"], ) }, ) From 7b7797279b6f0355e4106bad0c507eb256128962 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 10 Dec 2024 11:35:15 +0100 Subject: [PATCH 31/43] Made metric selection easier. --- community_tasks/swiss_legal_evals.py | 189 +++++++++++++++------------ 1 file changed, 108 insertions(+), 81 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 16177c2e3..beb738048 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -213,9 +213,6 @@ def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): ) -swiss_legal_translation_judge_gpt_4o = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") - - class GEMBA: def __init__(self, method: str = "GEMBA-MQM_norm", model: str = "gpt-4o"): self.method = method @@ -259,9 +256,6 @@ def get_gemba_judge(method: str = "GEMBA-MQM_norm", model: str = "gpt-4o"): ) -gemba_mqm_gpt_4o = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") - - def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-roberta-large", device: str = "cpu"): if device == "mps": raise ValueError("MPS is not supported for BERTScore") @@ -316,19 +310,6 @@ def get_bert_score(language: str, num_layers: int = 24, model_type: str = "xlm-r ) -# Create BERTScore metrics for each language - - -bert_scores = { - lang: get_bert_score( - language=lang, - model_type="xlm-roberta-large", - device=device, - ) - for lang in ["de", "fr", "it", "rm", "en"] -} - - class BLEURT: def __init__( self, @@ -408,9 +389,6 @@ def get_bleurt( ) -bleurt_large = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) - - class COMET: def __init__( self, @@ -476,12 +454,6 @@ def get_comet( ) -# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. 
-# comet_wmt22_da = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) -# xcomet_xl = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) -xcomet_xxl = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) - - class METEOR: def __init__(self, alpha=0.9, beta=3, gamma=0.5): self.alpha = alpha @@ -520,14 +492,15 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return statistics.mean(scores) * 100 -meteor = SampleLevelMetric( - metric_name="meteor", - higher_is_better=True, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - sample_level_fn=METEOR().compute, - corpus_level_fn=statistics.mean, -) +def get_meteor(): + return SampleLevelMetric( + metric_name="meteor", + higher_is_better=True, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=METEOR().compute, + corpus_level_fn=statistics.mean, + ) class BLEU: @@ -549,14 +522,15 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return statistics.mean(scores) * 100 -bleu_sentence = SampleLevelMetric( - metric_name="bleu_sentence", - higher_is_better=True, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - sample_level_fn=BLEU().compute, - corpus_level_fn=statistics.mean, -) +def get_bleu_sentence(): + return SampleLevelMetric( + metric_name="bleu_sentence", + higher_is_better=True, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=BLEU().compute, + corpus_level_fn=statistics.mean, + ) class CHRF: @@ -571,14 +545,15 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return statistics.mean(scores) * 100 -chrf_sentence = SampleLevelMetric( - metric_name="chrf_sentence", - higher_is_better=True, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - sample_level_fn=CHRF().compute, - corpus_level_fn=statistics.mean, -) +def get_chrf_sentence(): + return SampleLevelMetric( + metric_name="chrf_sentence", + higher_is_better=True, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=CHRF().compute, + corpus_level_fn=statistics.mean, + ) class TER: @@ -593,14 +568,15 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float: return statistics.mean(scores) * 100 -ter_sentence = SampleLevelMetric( - metric_name="ter_sentence", - higher_is_better=False, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.TRANSLATION, - sample_level_fn=TER().compute, - corpus_level_fn=statistics.mean, -) +def get_ter_sentence(): + return SampleLevelMetric( + metric_name="ter_sentence", + higher_is_better=False, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.TRANSLATION, + sample_level_fn=TER().compute, + corpus_level_fn=statistics.mean, + ) # EVALS WITH SUBSET @@ -750,6 +726,77 @@ def prompt_fn(line: dict, task_name: str = None): return prompt_fn +METRICS_TO_USE = [ + "bleu", + "chrf", + "bleu_sentence", + "chrf_sentence", + "ter_sentence", + "meteor", + "bert_score", + "bleurt_large", + "xcomet_xxl", + "gemba_mqm_gpt_4o", + "slt_judge_gpt_4o", +] +METRICS = {} + +# ===== Lexical metrics ===== +# Corpus level metrics +if "bleu" in METRICS_TO_USE: + METRICS["bleu"] = Metrics.bleu +if "chrf" in METRICS_TO_USE: + METRICS["chrf"] = Metrics.chrf +if "ter" in METRICS_TO_USE: + # TER often hangs for a while and takes more than 10 minutes to compute 
+ METRICS["ter"] = Metrics.ter +# Sample level metrics +if "bleu_sentence" in METRICS_TO_USE: + METRICS["bleu_sentence"] = get_bleu_sentence() +if "chrf_sentence" in METRICS_TO_USE: + METRICS["chrf_sentence"] = get_chrf_sentence() +if "ter_sentence" in METRICS_TO_USE: + METRICS["ter_sentence"] = get_ter_sentence() +if "meteor" in METRICS_TO_USE: + METRICS["meteor"] = get_meteor() +# ===== Model-based metrics ===== +if "bert_score" in METRICS_TO_USE: + METRICS["bert_score"] = { # Create BERTScore metrics for each language + lang: get_bert_score(language=lang, model_type="xlm-roberta-large", device=device) + for lang in ["de", "fr", "it", "rm", "en"] + } +if "bleurt_tiny" in METRICS_TO_USE: + METRICS["bleurt_tiny"] = get_bleurt(model_size="tiny", seq_len=512, batch_size=256, device=device) +if "bleurt_base" in METRICS_TO_USE: + METRICS["bleurt_base"] = get_bleurt(model_size="base", seq_len=512, batch_size=256, device=device) +if "bleurt_large" in METRICS_TO_USE: + METRICS["bleurt_large"] = get_bleurt(model_size="large", seq_len=512, batch_size=256, device=device) +# There are also reference-free models (e.g., Unbabel/wmt22-cometkiwi-da), but since we have reference gold labels, we use the reference-based models. +if "wmt22-comet-da" in METRICS_TO_USE: + METRICS["wmt22-comet-da"] = get_comet(model_name="Unbabel/wmt22-comet-da", batch_size=64, gpus=1, device=device) +if "xcomet_xl" in METRICS_TO_USE: + METRICS["xcomet_xl"] = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) +if "xcomet_xxl" in METRICS_TO_USE: + METRICS["xcomet_xxl"] = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) +if "gemba_mqm_gpt_4o" in METRICS_TO_USE: + METRICS["gemba_mqm_gpt_4o"] = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") +if "slt_judge_gpt_4o" in METRICS_TO_USE: + METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") +# Additionally we could consider adding the following open source judge models: +# flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 +# However, these are only fine-tuned on English data and we need multilingual support. + + +def get_metrics(METRICS_TO_USE, target_lang: str): + metrics = [] + for metric in METRICS_TO_USE: + if metric in METRICS: + metrics.append(METRICS[metric]) + elif metric == "bert_score": + metrics.append(METRICS["bert_score"][target_lang]) + return metrics + + class TranslationTask(LightevalTaskConfig): def __init__( self, @@ -771,27 +818,7 @@ def __init__( few_shots_split="validation", few_shots_select="sequential", generation_size=level_config.generation_size, - metric=[ - # ===== Lexical metrics ===== - # Corpus level metrics - Metrics.bleu, - Metrics.chrf, - # Metrics.ter, # TER often hangs for a while and takes more than 10 minutes to compute - # Sample level metrics - bleu_sentence, - chrf_sentence, - ter_sentence, - meteor, - # ===== Model-based metrics ===== - bert_scores[target_lang], - bleurt_large, # Only take the largest version, disregarding base and tiny - xcomet_xxl, # Only take the largest version, disregarding xcomet_xl, comet_wmt22_da - gemba_mqm_gpt_4o, - swiss_legal_translation_judge_gpt_4o, - # Additionally we could consider adding the following open source judge models: - # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 - # However, these are only fine-tuned on English data and we need multilingual support. 
- ], + metric=get_metrics(METRICS_TO_USE, target_lang), stop_sequence=level_config.stop_sequence, trust_dataset=True, # Remove the target language in the beginning if it exists: e.g., FR: {translation} From fcd9505237a5054ed6a83bf5aa25bd00e05189dd Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 10 Dec 2024 18:50:10 +0100 Subject: [PATCH 32/43] Fixed dict issue. --- community_tasks/swiss_legal_evals.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index beb738048..9565b8ca3 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -736,7 +736,6 @@ def prompt_fn(line: dict, task_name: str = None): "bert_score", "bleurt_large", "xcomet_xxl", - "gemba_mqm_gpt_4o", "slt_judge_gpt_4o", ] METRICS = {} @@ -778,7 +777,7 @@ def prompt_fn(line: dict, task_name: str = None): METRICS["xcomet_xl"] = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) if "xcomet_xxl" in METRICS_TO_USE: METRICS["xcomet_xxl"] = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) -if "gemba_mqm_gpt_4o" in METRICS_TO_USE: +if "gemba_mqm_gpt_4o" in METRICS_TO_USE: # TODO: Somehow in long evaluations something is wrong here METRICS["gemba_mqm_gpt_4o"] = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") if "slt_judge_gpt_4o" in METRICS_TO_USE: METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") @@ -791,9 +790,11 @@ def get_metrics(METRICS_TO_USE, target_lang: str): metrics = [] for metric in METRICS_TO_USE: if metric in METRICS: - metrics.append(METRICS[metric]) - elif metric == "bert_score": - metrics.append(METRICS["bert_score"][target_lang]) + if metric == "bert_score": + # Add only the BERTScore for the target language + metrics.append(METRICS["bert_score"][target_lang]) + else: + metrics.append(METRICS[metric]) return metrics From 5a8ca464a0963edf9fd32af481ccd15870b69aac Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 11 Dec 2024 15:15:36 +0100 Subject: [PATCH 33/43] Added metric dependencies. --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 9a4d3a3ce..9aa1d76ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,8 @@ dependencies = [ "scikit-learn", "spacy==3.7.2", "sacrebleu", + "unbabel-comet", + "gemba", "rouge_score==0.1.2", "sentencepiece>=0.1.99", "protobuf==3.20.*", # pinned for sentencepiece compat From bab94af49a75d57b25498fe864dae7ecb2f46562 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 11 Dec 2024 15:16:54 +0100 Subject: [PATCH 34/43] Moving metrics to extended tasks. 
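Since unbabel-comet and gemba become optional extended-task dependencies in the diff below, the COMET and GEMBA metrics are only usable when that extra is installed. A small, hypothetical import guard showing one way a task module could surface the requirement early; the error wording is made up, and the extra name matches the pyproject change:

try:
    from comet import download_model, load_from_checkpoint  # noqa: F401
    from gemba import get_gemba_scores  # noqa: F401
except ImportError as err:
    raise ImportError(
        "COMET and GEMBA metrics need the extended-tasks dependencies; "
        "install them with `pip install lighteval[extended_tasks]`."
    ) from err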
--- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9aa1d76ef..e1a2a772e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,8 +74,6 @@ dependencies = [ "scikit-learn", "spacy==3.7.2", "sacrebleu", - "unbabel-comet", - "gemba", "rouge_score==0.1.2", "sentencepiece>=0.1.99", "protobuf==3.20.*", # pinned for sentencepiece compat @@ -101,7 +99,9 @@ docs = ["hf-doc-builder", "watchdog"] extended_tasks = [ "langdetect", # ifeval "openai", # llm as a judge using openai models - "tiktoken" + "tiktoken", + "unbabel-comet", + "gemba", ] s3 = ["s3fs"] multilingual = [ From 09be56d87c662b6a6cec33bd2c5bd1d0fbeb0d5d Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Sun, 22 Dec 2024 16:43:58 +0100 Subject: [PATCH 35/43] Added support for judges from different providers. --- community_tasks/swiss_legal_evals.py | 36 +++++++++++++++++++++------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 9565b8ca3..17fdf87b9 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -182,7 +182,7 @@ def compute( questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] options = [formatted_doc.choices for formatted_doc in formatted_docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] - predictions = [response[0].result[0] for response in responses] + predictions = [response[0].result for response in responses] scores, _, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds) # Exclude the messages (user prompt) because they are too long @@ -195,21 +195,24 @@ def compute( ] -def get_swiss_legal_translation_judge(judge_model_name: str = "gpt-4o"): - name = f"slt_judge_{judge_model_name}" +def get_swiss_legal_translation_judge( + judge_model_name: str = "openai/gpt-4o-2024-11-20", + short_judge_name: str = "slt_judge_gpt-4o", + backend: str = "litellm", +): return SampleLevelMetricGrouping( - metric_name=[name], - higher_is_better={name: True}, + metric_name=[short_judge_name], + higher_is_better={short_judge_name: True}, category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.TRANSLATION, sample_level_fn=JudgeSwissLegalTranslation( judge_model_name=judge_model_name, template=swiss_legal_translation_judge, process_judge_response=process_judge_response_freeform_gpt, - judge_backend="openai", - short_judge_name=name, + judge_backend=backend, + short_judge_name=short_judge_name, ).compute, - corpus_level_fn={name: statistics.mean}, + corpus_level_fn={short_judge_name: statistics.mean}, ) @@ -779,8 +782,23 @@ def prompt_fn(line: dict, task_name: str = None): METRICS["xcomet_xxl"] = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) if "gemba_mqm_gpt_4o" in METRICS_TO_USE: # TODO: Somehow in long evaluations something is wrong here METRICS["gemba_mqm_gpt_4o"] = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") +if "slt_judge_gpt_4o_mini" in METRICS_TO_USE: + METRICS["slt_judge_gpt_4o_mini"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-mini-2024-07-18", short_judge_name="slt_judge_gpt-4o-mini" + ) if "slt_judge_gpt_4o" in METRICS_TO_USE: - METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge(judge_model_name="gpt-4o") + METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", 
short_judge_name="slt_judge_gpt-4o" + ) +if "slt_judge_haiku_35" in METRICS_TO_USE: + METRICS["slt_judge_haiku_35"] = get_swiss_legal_translation_judge( + judge_model_name="anthropic/claude-3-5-haiku-20241022", short_judge_name="slt_judge_haiku-3.5" + ) +if "slt_judge_sonnet_35" in METRICS_TO_USE: + METRICS["slt_judge_sonnet_35"] = get_swiss_legal_translation_judge( + judge_model_name="anthropic/claude-3-5-sonnet-20241022", short_judge_name="slt_judge_sonnet-3.5" + ) + # Additionally we could consider adding the following open source judge models: # flowaicom/Flow-Judge-v0.1, prometheus-eval/prometheus-7b-v2.0 # However, these are only fine-tuned on English data and we need multilingual support. From 0aa860777f85aa39eb85ee28b88054084fe9ca55 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Sun, 22 Dec 2024 17:18:16 +0100 Subject: [PATCH 36/43] Added additional system and user prompts and few shot examples. --- community_tasks/swiss_legal_evals.py | 162 +++++++++++++++++++++++---- 1 file changed, 140 insertions(+), 22 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 17fdf87b9..bf43c31bf 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -79,18 +79,56 @@ # CUSTOM METRICS - -def swiss_legal_translation_judge(question, options, answer, gold): - return [ - { - "role": "system", - "content": "Act as a Judge specializing in the evaluation of translations of Swiss legal documents. Your task is to assess the accuracy, clarity, and fidelity of the model's translation to the golden translation, while considering the nuances of legal language.", - }, - { - "role": "user", - "content": f"""You will be provided with a source text, its golden translation, and the model's translation. Your task is to judge how correct the model's translation is based on the golden translation, and then give a correctness score. The correctness score should be one of the below numbers: 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). You should first briefly give your reasoning process regarding how the model's translation conforms to or contradicts the golden translation, and then give the correctness score. The correctness score must strictly follow this format: \"[[score]]\", e.g., \"The correctness score: [[0.5]]\". Below are some examples. - -Example 1: +SYSTEM = { + "basic": "Act as a Judge specializing in the evaluation of translations of Swiss legal documents. Your task is to assess the accuracy, clarity, and fidelity of the model's translation to the golden translation, while considering the nuances of legal language.", + "detailed": "You are a senior legal translator and quality assurance specialist with over 20 years of experience in Swiss law, certified by the Swiss Sworn Translators Association (Association suisse des traducteurs-jurés, ASTJ). You possess native-level proficiency in all Swiss national languages (German, French, Italian, and Romansh) as well as English, enabling precise evaluation of legal nuances across all linguistic combinations. Your task is to evaluate machine-translated legal texts for accuracy, clarity and fidelity to Swiss legal standards analyzing the subtle complexities of legal language. 
You excel at identifying even minor discrepancies and calibrating evaluation scores appropriately to reflect the severity of each error.", +} + +USER = { + "basic": """ +You will be provided with a source text, its golden translation, and the model's translation. Your task is to judge how correct the model's translation is based on the golden translation, and then give a correctness score. The correctness score should be one of the below numbers: 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). You should first briefly give your reasoning process regarding how the model's translation conforms to or contradicts the golden translation, and then give the correctness score. The correctness score must strictly follow this format: \"[[score]]\", e.g., \"The correctness score: [[0.5]]\". Below are some examples. +""", + "detailed": """ +INPUT FORMAT: +Source Text: [Original text in source language] +Golden Translation: [Reference professional translation] +Model Translation: [Machine-generated translation to be evaluated] + + +EVALUATION DIMENSIONS: +Accuracy: Semantic equivalence, correct legal terminology, and preservation of legal meaning. +Clarity: Logical flow, appropriate legal register, and unambiguous expression. +Fidelity: Adherence to Swiss legal conventions, jurisdiction-specific terminology, and formal register. + + +SCORING RUBRIC: +1.0: Perfect translation +0.7-0.9: Minor issues only +0.4-0.6: Significant but non-critical errors +0.1-0.3: Major errors affecting legal meaning +0.0: Completely incorrect + + +REQUIRED OUTPUT FORMAT: +Reasoning: Analyze how the model's translation aligns with or differs from the golden translation, focusing on significant legal and linguistic aspects. +Examples: Identify specific terms, phrases, or sections in the model's answer that were correct or incorrect, with explanations. +Score: End with "The correctness score: [[score]]" + + +EVALUATION GUIDELINES: +Stylistic differences should not impact accuracy significantly unless they alter the legal meaning. +Untranslated Latin terms (e.g., prima facie) are not considered errors, but they should still be assessed for appropriate use within the context of the answer. +Terminology should be used consistently throughout the text. +Consider both explicit and implicit legal meanings. +Consider jurisdiction-specific legal terminology. +Flag any ambiguities, omissions or additions that affect legal meaning. + +Below are some examples. +""", +} + +FEW_SHOT = { + "diverse": """Example 1: Source Text: ```A contract is void if its terms are impossible, unlawful or immoral. However, where the defect pertains only to certain terms of a contract, those terms alone are void unless there is cause to assume that the contract would not have been concluded without them.``` @@ -100,7 +138,8 @@ def swiss_legal_translation_judge(question, options, answer, gold): Model’s Translation: ```Il contratto è nullo se le sue clausole sono impossibili, illecite o immorali. Tuttavia, quando il vizio riguarda solo determinate clausole del contratto, solo queste sono nulle, salvo che vi sia motivo di ritenere che il contratto non sarebbe stato concluso senza di esse.``` -Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy, clarity, and fidelity to the source text. However, there are minor stylistic differences. For example, the golden translation uses “conchiuso,” an older and more formal term, while the model opts for “concluso,” which is modern. 
Similarly, the golden translation uses the idiomatic phrase “contraria alle leggi od ai buoni costumi,” whereas the model employs the more literal “illecite o immorali”. The correctness score: [[0.9]] +Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy, clarity, and fidelity to the source text. However, there are minor stylistic differences. For example, the golden translation uses “conchiuso” an older and more formal term, while the model opts for “concluso” which is modern. Similarly, the golden translation uses the idiomatic phrase “contraria alle leggi od ai buoni costumi” whereas the model employs the more literal “illecite o immorali”. The correctness score: [[0.9]] + Example 2: Source Text: @@ -120,6 +159,7 @@ def swiss_legal_translation_judge(question, options, answer, gold): Your Judgment: The model’s translation mostly aligns with the golden translation but diverges when it comes to accuracy and fidelity to Swiss legal terminology. For instance, the term “exploitabilité” which is closer to the Swiss provision is replaced in the model’s translation with “admissibilité”. Similarly, “ingérence” is used instead of “atteinte”, although “atteinte” is commonly used in Swiss law to discuss a violation of fundamental rights. Also, the term "recherche automatisée de véhicules et surveillance du trafic (RVS)" used by the golden translation is more established than "poursuite automatisée des véhicules et de la surveillance de la circulation (AFV)" in the model’s translation. The model’s translation is almost complete, but omits a critical point in one sentence: that the evidence was unlawfully obtained due to lack of a sufficiently clear legal basis. This omission impacts the completeness. The correctness score: [[0.7]] + Example 3: Source Text: ```Yoko Ono est propriétaire de la montre de John Lennon – rejet du recours d'un collectionneur contre un arrêt rendu par la Cour de justice genevoise @@ -139,7 +179,6 @@ def swiss_legal_translation_judge(question, options, answer, gold): Das Bundesgericht weist die dagegen erhobene Beschwerde des Sammlers ab. Unbestritten ist zunächst, dass das Eigentum an der Uhr nach dem Tod von John Lennon durch Erbschaft an Yoko Ono übergegangen ist. Sodann durfte das Genfer Kantonsgericht willkürfrei davon ausgehen, dass die Uhr vom ehemaligen Chauffeur gestohlen wurde und umgekehrt nichts dafür spricht, dass Yoko Ono dem Chauffeur dieses einzigartige Stück, welches sie John Lennon zwei Monate vor seinem Tod mit einer Inschrift versehen überreicht hatte, hätte schenken wollen. Da es sich bei der Uhr um eine gestohlene Sache handelt, konnte der Sammler und heutige Beschwerdeführer bei ihrem Erwerb 2014 in Deutschland nicht unmittelbar zum Eigentümer werden; dies gilt gemäss dem diesbezüglich anwendbaren deutschen Recht unabhängig davon, ob der Erwerber in Bezug auf die Herkunft der Sache in gutem Glauben war.``` - Model’s Translation: ```Hier, der Bundesgerichtshof hat den Antrag des Käufer gegen den Urteil des Zivilgerichts in Zürich abgelehnt, das das Zivilgericht in Zürich 2023 bestätigt hatte, dass Yoko Ono die Uhr als Geschenk 1980 an John Lennon gegeben hat, bevor dieser am 8. Dezember 1980 ermordet wurde. Der Käufer, der die Uhr 2014 in eine Geneßische Auktion verkauft hat, behauptet, die Uhr sei ihm verliehen worden, und nicht von Yoko Ono. @@ -152,22 +191,72 @@ def swiss_legal_translation_judge(question, options, answer, gold): Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. 
Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat.``` Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has several grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] +""", + "fr-de": """Example 1: +Source Text: +```Le contrat est nul s’il a pour objet une chose impossible, illicite ou contraire aux moeurs. Si le contrat n’est vicié que dans certaines de ses clauses, ces clauses sont seules frappées de nullité, à moins qu’il n’y ait lieu d’admettre que le contrat n’aurait pas été conclu sans elles.``` -Judge the below case, give the brief reasoning process and the correctness score. +Golden Translation: +```Ein Vertrag, der einen unmöglichen oder widerrechtlichen Inhalt hat oder gegen die guten Sitten verstösst, ist nichtig. Betrifft aber der Mangel bloss einzelne Teile des Vertrages, so sind nur diese nichtig, sobald nicht anzunehmen ist, dass er ohne den nichtigen Teil überhaupt nicht geschlossen worden wäre.``` + +Model’s Translation: +```Der Vertrag ist nichtig, wenn er einen unmöglichen, widerrechtlichen oder sittenwidrigen Inhalt hat. Betrifft der Mangel bloß einzelne Teile des Vertrages, so sind nur diese nichtig, sobald nicht anzunehmen ist, dass er ohne den nichtigen Teil überhaupt nicht geschlossen worden wäre.``` +Your Judgment: The model’s translation aligns well with the golden translation in terms of accuracy and clarity. However, minor stylistic differences exist. For example, the golden translation uses “gegen die guten Sitten verstösst” which is more idiomatic, while the model opts for the more literal “sittenwidrigen Inhalt hat” Similarly, “Ein Vertrag” in the golden translation better reflects generalized legal phrasing than the model’s literal “Der Vertrag”. The correctness score: [[0.9]] + +Example 2: Source Text: -```{question}``` +```Art. 13 al. 2, art. 36 al. 1 Cst.; art. 141 al. 2 CPP; exploitabilité d'enregistrements réalisés par la police au moyen du Système de recherche automatisée de véhicules et surveillance du trafic (RVS). +La réalisation et le stockage d'enregistrements par la RVS constituent une atteinte aux droits fondamentaux des personnes concernées, singulièrement à la protection de leur sphère privée, qui englobe le droit à l'autodétermination en matière de données (consid. 3.1). Le recours à la RVS ne repose sur aucune base légale suffisamment précise dans le canton de Thurgovie. L'atteinte à la sphère privée inhérente à cette surveillance viole partant l'art. 13 al. 2 en corrélation avec l'art. 36 al. 1 Cst. (consid. 3.2 et 3.3). +La constatation d'infractions par la police dans le cadre de son activité de contrôle préventive, relève de l'investigation au sens des art. 306 ss CPP. 
L'exploitabilité dans le procès pénal de preuves recueillies de manière illicite faute de base légale doit être examinée à l'aune de l'art. 141 al. 2 CPP (précision de la jurisprudence; consid. 4.1 et 4.2). Exploitabilité niée en l'espèce (consid. 4.3).``` Golden Translation: -```{gold}``` +```Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von polizeilichen Aufzeichnungen der automatischen Fahrzeugfahndung und Verkehrsüberwachung (AFV). +Die Erhebung und die Aufbewahrung von Aufzeichnungen der AFV stellen einen Eingriff in die Grundrechte der Betroffenen dar, insbesondere in das Recht auf Privatsphäre, das den Anspruch auf informationelle Selbstbestimmung miteinschliesst (E. 3.1). Für die AFV besteht im Kanton Thurgau keine hinreichend bestimmte gesetzliche Grundlage. Der mit der Überwachung verbundene Eingriff in die Privatsphäre verstösst daher gegen Art. 13 Abs. 2 i.V.m. Art. 36 Abs. 1 BV (E. 3.2 und 3.3). +Stellt die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit strafbare Handlungen fest, ermittelt sie nach Art. 306 ff. StPO. Die Frage, ob die mangels gesetzlicher Grundlage rechtswidrig erhobenen Beweismittel im Strafprozess verwertbar sind, ist nach Art. 141 Abs. 2 StPO zu prüfen (Präzisierung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit im vorliegenden Fall verneint (E. 4.3).``` -Model's Translation: -```{answer}``` +Model’s Translation: +```Art. 13 Abs. 2, Art. 36 Abs. 1 BV; Art. 141 Abs. 2 StPO; Verwertbarkeit von Aufzeichnungen, die durch die Polizei mit dem System zur automatisierten Fahrzeugerfassung und Verkehrsüberwachung (RVS) durchgeführt wurden. +Die Durchführung und Speicherung von Aufzeichnungen durch das RVS stellt einen Eingriff in die Grundrechte der betroffenen Personen dar, insbesondere in den Schutz ihrer Privatsphäre, die das Recht auf Selbstbestimmung in Bezug auf Daten umfasst (E. 3.1). Der Einsatz von RVS stützt sich im Kanton Thurgau auf keine ausreichend präzise gesetzliche Grundlage. Die in dieser Überwachung liegende Beeinträchtigung der Privatsphäre verstößt daher gegen Art. 13 Abs. 2 in Verbindung mit Art. 36 Abs. 1 BV (E. 3.2 und 3.3). +Die Aufdeckung von Straftaten durch die Polizei im Rahmen ihrer präventiven Kontrolltätigkeit fällt unter die Ermittlung im Sinne der Art. 306 ff. StPO. Die Verwertbarkeit im Strafverfahren von Beweisen, die auf illegale Weise erhoben wurden, muss anhand von Art. 141 Abs. 2 StPO geprüft werden (Klarstellung der Rechtsprechung; E. 4.1 und 4.2). Verwertbarkeit in diesem Fall verneint (E. 4.3).``` -Your Judgment:""", - }, - ] +Your Judgment: The model’s translation is accurate overall but omits a critical point in the second-to-last sentence: the evidence was unlawfully obtained due to a lack of legal basis. Additionally, its fidelity to Swiss legal terminology is limited. For example, the model uses "Klarstellung der Rechtsprechung" instead of the more appropriate "Präzisierung der Rechtsprechung" and "nicht ausreichend präzise" rather than the common "hinreichend bestimmt" It also consistently uses the French abbreviation "RVS" instead of the German "automatische Fahrzeugfahndung und Verkehrsüberwachung (AFV)" Lastly, "Recht auf Selbstbestimmung in Bezug auf Daten" is overly literal compared to the idiomatic "Anspruch auf informationelle Selbstbestimmung". 
The correctness score: [[0.6]] + + +Example 3: +Source Text: +```Yoko Ono est propriétaire de la montre de John Lennon – rejet du recours d'un collectionneur contre un arrêt rendu par la Cour de justice genevoise + +Le Tribunal fédéral rejette le recours déposé par un collectionneur contre l'arrêt de la Cour de justice genevoise par lequel celle-ci confirmait que Yoko Ono est propriétaire de la montre qu'elle avait offerte à John Lennon en 1980, deux mois avant qu'il ne soit assassiné. Le collectionneur, qui a remis la montre à une maison de vente aux enchères genevoise en 2014 afin d'en faire estimer la valeur, a quant à lui revendiqué la propriété de ladite montre. + +En 1980, Yoko Ono a acquis à New York une montre de marque Patek Philippe. Elle y a fait graver au dos l'inscription « (JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C » et l'a offerte à son époux, John Lennon, le 9 octobre 1980 pour son 40e anniversaire. Le 8 décembre 1980, John Lennon a été assassiné à New York. La montre a été répertoriée dans l'inventaire successoral et conservée dans une pièce de l'appartement de Yoko Ono à New York. Par la suite, la montre s'est retrouvée aux mains d'un homme qui avait été le chauffeur privé de Yoko Ono de 1995 à 2006. Un autre possesseur intermédiaire l'a remise à une maison de vente aux enchères allemande, où elle a été acquise par un collectionneur en 2014. Ce dernier l'a remise la même année à une maison de vente aux enchères genevoise afin d'en faire estimer la valeur, ce dont a été informée Yoko Ono. Cette dernière n'avait jusqu'alors pas eu conscience du fait que la montre n'était plus en sa possession. En 2018, le collectionneur a formé à Genève une action visant à constater sa qualité de propriétaire, action à laquelle Yoko Ono s'est opposée. En 2022, le tribunal de première instance genevois a constaté que Yoko Ono était la seule et unique propriétaire de la montre, ce que la Cour de justice du canton de Genève, statuant sur appel du collectionneur, a confirmé en 2023. + +Le Tribunal fédéral rejette le recours déposé par le collectionneur contre cet arrêt. Il n'est tout d'abord pas contesté que la propriété de la montre a été acquise par succession par Yoko Ono après le décès de John Lennon. C'est en outre sans arbitraire que la Cour de justice genevoise a retenu que la montre avait été volée par l'ancien chauffeur et que, à l'inverse, aucun élément ne permettait de démontrer que Yoko Ono aurait eu l'intention de faire donation au chauffeur d'une chose si particulière que la montre, gravée d'une inscription, qu'elle avait offerte à John Lennon deux mois avant son décès. Dès lors qu'il s'agit d'une chose volée, le collectionneur, aujourd'hui recourant, ne pouvait pas acquérir la propriété de la montre par un mode originaire d'acquisition lorsqu'il l'a achetée en Allemagne en 2014 ; selon le droit allemand applicable en la matière, cela vaut indépendamment du fait que l'acquéreur était ou non de bonne foi quant à l'origine de la chose.``` + +Golden Translation: +```Yoko Ono ist Eigentümerin der Uhr von John Lennon – Beschwerde von Sammler gegen Genfer Urteil abgewiesen + +Das Bundesgericht weist die Beschwerde eines Sammlers gegen das Urteil des Genfer Kantonsgerichts ab, mit dem Yoko Ono als Eigentümerin der Uhr bestätigt wurde, die sie John Lennon 1980 zwei Monate vor seiner Ermordung geschenkt hat. Der Sammler hatte die Uhr 2014 zur Schätzung bei einem Auktionshaus in Genf eingereicht und seinerseits Eigentümerschaft an der Uhr geltend gemacht. 
+ +Yoko Ono hatte 1980 in New York eine Uhr der Marke Patek Philippe gekauft. Sie liess auf der Rückseite die Gravur "(JUST LIKE) STARTING OVER LOVE YOKO 10·9·1980 N.Y.C" anbringen und schenkte sie ihrem Ehemann John Lennon am 9. Oktober 1980 zum 40. Geburtstag. Am 8. Dezember 1980 wurde John Lennon in New York ermordet. Die Uhr wurde ins Erbschaftsinventar aufgenommen und in einem Zimmer der Wohnung von Yoko Ono in New York aufbewahrt. Sie gelangte von dort in die Hände eines Mannes, der von 1995 bis 2006 Privatchauffeur von Yoko Ono gewesen war. Ein weiterer Zwischenbesitzer brachte die Uhr in ein deutsches Auktionshaus, wo sie 2014 von einem Sammler erworben wurde. Dieser reichte die Uhr im gleichen Jahr bei einem Auktionshaus in Genf zur Schätzung ihres Wertes ein. Davon erfuhr Yoko Ono, die bis dahin keine Kenntnis davon gehabt hatte, dass sich die Uhr nicht mehr in ihrem Besitz befand. Der Sammler erhob 2018 in Genf eine Klage auf Feststellung seiner Eigentümerschaft, der sich Yoko Ono widersetzte. Das erstinstanzliche Genfer Gericht stellte 2022 fest, dass Yoko Ono die alleinige Eigentümerin der Uhr sei, was auf Berufung des Sammlers 2023 vom Kantonsgericht des Kantons Genf bestätigt wurde. + +Das Bundesgericht weist die dagegen erhobene Beschwerde des Sammlers ab. Unbestritten ist zunächst, dass das Eigentum an der Uhr nach dem Tod von John Lennon durch Erbschaft an Yoko Ono übergegangen ist. Sodann durfte das Genfer Kantonsgericht willkürfrei davon ausgehen, dass die Uhr vom ehemaligen Chauffeur gestohlen wurde und umgekehrt nichts dafür spricht, dass Yoko Ono dem Chauffeur dieses einzigartige Stück, welches sie John Lennon zwei Monate vor seinem Tod mit einer Inschrift versehen überreicht hatte, hätte schenken wollen. Da es sich bei der Uhr um eine gestohlene Sache handelt, konnte der Sammler und heutige Beschwerdeführer bei ihrem Erwerb 2014 in Deutschland nicht unmittelbar zum Eigentümer werden; dies gilt gemäss dem diesbezüglich anwendbaren deutschen Recht unabhängig davon, ob der Erwerber in Bezug auf die Herkunft der Sache in gutem Glauben war.``` + +Model’s Translation: +```Hier, der Bundesgerichtshof hat den Antrag des Käufer gegen den Urteil des Zivilgerichts in Zürich abgelehnt, das das Zivilgericht in Zürich 2023 bestätigt hatte, dass Yoko Ono die Uhr als Geschenk 1980 an John Lennon gegeben hat, bevor dieser am 8. Dezember 1980 ermordet wurde. Der Käufer, der die Uhr 2014 in eine Geneßische Auktion verkauft hat, behauptet, die Uhr sei ihm verliehen worden, und nicht von Yoko Ono. + +Im Jahr 1980 hat Yoko Ono in New York eine Uhr von der Marke Patek Philippe gekauft. Sie hat auf der Rückseite der Uhr die Inschrift "JUST LIKE STARTING OVER LOVE YOKO 10·9·1980 N.Y.C." gemalt und sie an ihren Ehemann, John Lennon, am 9. Oktober 1980 als Geburtstag gegeben. Am 8. Dezember 1980 wurde Lennon ermordet in New York. Die Uhr wurde in den Erbeverzeichnis des Lennon-Mädchens aufbewahrt und in einem Zimmer in Yoko Onos New Yorker Wohnung. + +Danach wurde die Uhr von einem Mann, der als Wagenkellner von Yoko Ono zwischen 1995 und 2006 gearbeitet hat, in die Hände gebracht. Ein weiterer mittlerer Eigentümer hat sie 2014 in eine deutsche Auktion verkauft, wo sie einem Käufer in Zürich übergeben wurde. Dieser Käufer hat die Uhr 2014 in eine Geneßische Auktion verkauft, um sie zu valieren, und Yoko Ono war von diesem Moment an nicht mehr bewusst, dass die Uhr nicht mehr in ihrer Hand war. 
+ +Im Jahr 2018 hat der Käufer in Zürich eine Anzeige gegen Yoko Ono eröffnet, um die Qualität des Eigentums der Uhr zu bestätigen, was Yoko Ono ansprach. Im Jahr 2022 hat der Zivilgericht in Zürich bestätigt, dass Yoko Ono die einzige und letzte Eigentümerin der Uhr ist, was der Zivilgerichtshof im Jahr 2023 bestätigt hat. + +Der Bundesgerichtshof hat den Antrag des Käufer gegen dieses Urteil abgelehnt. Er ist zunächst nicht bezweifelt, dass Yoko Ono die Uhr nach dem Tod von John Lennon durch Erbvergabe gekauft hat. Es ist auch nicht bezweifelt, dass die Zivilgerichtsverfassung in Zürich die Uhr als gestohlen angenommen hat, da niemand einen Beweis dafür gab, dass Yoko Ono die Uhr als Geschenk an den Wagenkellner hatte. Deshalb kann der Käufer nicht durch eine rechtliche Handlung die Uhr von Yoko Ono erwerben, wenn er die Uhr in Deutschland gekauft hat.``` + +Your Judgment: The model’s translation diverges significantly from the golden translation in accuracy, clarity, and fidelity. Critical legal terminology is mistranslated, omitted, and distorted. For instance, the courts are misidentified (“Zivilgerichtsverfassung”, “Zivilgericht”, “Bundesgerichtshof”). The model’s translation has several grammatical errors, such as “Geneßische Auktion”, “Erbvergabe”, “Wagenkellner” and “zu valieren”. The model also omits the explanation that, under German law, stolen property cannot be acquired in good faith. The correctness score: [[0.2]] +""", +} class JudgeSwissLegalTranslation(JudgeLLM): @@ -199,7 +288,36 @@ def get_swiss_legal_translation_judge( judge_model_name: str = "openai/gpt-4o-2024-11-20", short_judge_name: str = "slt_judge_gpt-4o", backend: str = "litellm", + system_style: str = "basic", # "basic" or "detailed" + few_shot_style: str = "diverse", # "diverse" or "fr-de" ): + def swiss_legal_translation_judge(question, options, answer, gold): + system_prompt = SYSTEM[system_style] + user = USER[system_style] + few_shot_examples = FEW_SHOT[few_shot_style] + instruction = f"""Judge the below case, give the brief reasoning process and the correctness score. + + +Source Text: +```{question}``` + +Golden Translation: +```{gold}``` + +Model's Translation: +```{answer}``` + +Your Judgment: +""" + + user_prompt = user + few_shot_examples + instruction + + print(user_prompt) + print(system_prompt) + exit() + + return [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}] + return SampleLevelMetricGrouping( metric_name=[short_judge_name], higher_is_better={short_judge_name: True}, From c49e1e23a4519c24fe9f1443671da42d6a429d12 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 23 Dec 2024 10:25:42 +0100 Subject: [PATCH 37/43] Removed debug relics. --- community_tasks/swiss_legal_evals.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index bf43c31bf..ccc18defe 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -309,13 +309,8 @@ def swiss_legal_translation_judge(question, options, answer, gold): Your Judgment: """ - user_prompt = user + few_shot_examples + instruction - print(user_prompt) - print(system_prompt) - exit() - return [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}] return SampleLevelMetricGrouping( From 4418e82b5b203fe8c041132c62add7189cf54a8f Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 23 Dec 2024 10:31:49 +0100 Subject: [PATCH 38/43] Fixed issue in judge prompt. 
--- community_tasks/swiss_legal_evals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index ccc18defe..6f7c8ab57 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -268,7 +268,7 @@ def compute( **kwargs, ) -> dict[str, float]: logger.info(f"Judging {len(formatted_docs)} samples with {self.short_judge_name}...") - questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] + questions = [formatted_doc.specific["source"] for formatted_doc in formatted_docs] options = [formatted_doc.choices for formatted_doc in formatted_docs] golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result for response in responses] From 075ebd2ede76419444cbb1d0b4e75fc3185cf4ab Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 23 Dec 2024 14:54:15 +0100 Subject: [PATCH 39/43] Adapted getting predictions to new way for all metrics. --- community_tasks/swiss_legal_evals.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 6f7c8ab57..a6bc260a8 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -348,7 +348,7 @@ def compute( # There should be only one language each in the batch assert len(set(source_langs)) == len(set(target_langs)) == 1 sources = [formatted_doc.specific["source"] for formatted_doc in formatted_docs] - predictions = [response[0].result[0] for response in responses] + predictions = [response[0].result for response in responses] answers, errors = get_gemba_scores( sources, predictions, source_langs[0], target_langs[0], method=self.method, model=self.model @@ -461,7 +461,7 @@ def compute( ) -> dict[str, float]: logger.info(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] - predictions = [response[0].result[0] for response in responses] + predictions = [response[0].result for response in responses] all_scores = [] for i in range(0, len(golds), self.batch_size): @@ -531,7 +531,7 @@ def compute( ) -> dict[str, float]: logger.info(f"Scoring {len(formatted_docs)} samples with {self.metric_name}...") golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] - predictions = [response[0].result[0] for response in responses] + predictions = [response[0].result for response in responses] sources = [kwargs["formatted_doc"].specific["source"] for kwargs["formatted_doc"] in formatted_docs] data = [{"src": src, "mt": pred, "ref": gold} for src, pred, gold in zip(sources, predictions, golds)] From 8ee2dbc7439f2849a581464e064fdaf068e6d778 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 23 Dec 2024 14:55:22 +0100 Subject: [PATCH 40/43] Added gemba mqm metric by default. 
--- community_tasks/swiss_legal_evals.py | 1 + 1 file changed, 1 insertion(+) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index a6bc260a8..6886e7a47 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -852,6 +852,7 @@ def prompt_fn(line: dict, task_name: str = None): "bert_score", "bleurt_large", "xcomet_xxl", + "gemba_mqm_gpt_4o", "slt_judge_gpt_4o", ] METRICS = {} From 4408d0d0ca517276c4735434220cc498759d4d4d Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 25 Dec 2024 11:25:50 +0100 Subject: [PATCH 41/43] Fixed error in gemba score when errors are not dicts. --- community_tasks/swiss_legal_evals.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 6886e7a47..44ce31d77 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -354,10 +354,16 @@ def compute( sources, predictions, source_langs[0], target_langs[0], method=self.method, model=self.model ) - # Convert defaultdict to dict - errors = [[{key: value} for key, value in error.items()] for error in errors] + # Handle cases where errors might be nan + formatted_errors = [] + for error in errors: + if isinstance(error, dict): + # Convert defaultdict to dict + formatted_errors.append([{key: value} for key, value in error.items()]) + else: + formatted_errors.append([{"error": ["No error details available"]}]) - return [{self.name: answer, f"{self.name}_errors": error} for answer, error in zip(answers, errors)] + return [{self.name: answer, f"{self.name}_errors": error} for answer, error in zip(answers, formatted_errors)] def get_gemba_judge(method: str = "GEMBA-MQM_norm", model: str = "gpt-4o"): From be6d9abe5a492b7da2ca242569dcaa732f8e26d5 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 25 Dec 2024 11:33:00 +0100 Subject: [PATCH 42/43] Added different judge configurations for gpt 4o.
--- community_tasks/swiss_legal_evals.py | 38 +++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index 44ce31d77..d6d8e61d1 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -900,15 +900,45 @@ def prompt_fn(line: dict, task_name: str = None): METRICS["xcomet_xl"] = get_comet(model_name="Unbabel/XCOMET-XL", batch_size=32, gpus=1, device=device) if "xcomet_xxl" in METRICS_TO_USE: METRICS["xcomet_xxl"] = get_comet(model_name="Unbabel/XCOMET-XXL", batch_size=16, gpus=1, device=device) -if "gemba_mqm_gpt_4o" in METRICS_TO_USE: # TODO: Somehow in long evaluations something is wrong here +if "gemba_mqm_gpt_4o" in METRICS_TO_USE: METRICS["gemba_mqm_gpt_4o"] = get_gemba_judge(method="GEMBA-MQM_norm", model="gpt-4o") if "slt_judge_gpt_4o_mini" in METRICS_TO_USE: METRICS["slt_judge_gpt_4o_mini"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-mini-2024-07-18", short_judge_name="slt_judge_gpt-4o-mini" + judge_model_name="openai/gpt-4o-mini-2024-07-18", + short_judge_name="slt_judge_gpt-4o-mini", ) if "slt_judge_gpt_4o" in METRICS_TO_USE: - METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge( - judge_model_name="openai/gpt-4o-2024-11-20", short_judge_name="slt_judge_gpt-4o" + METRICS["slt_judge_gpt_4o_basic_diverse"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o", + ) +if "slt_judge_gpt_4o_basic_diverse" in METRICS_TO_USE: + METRICS["slt_judge_gpt_4o_basic_diverse"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o-basic-diverse", + system_style="basic", + few_shot_style="diverse", + ) +if "slt_judge_gpt_4o_basic_fr-de" in METRICS_TO_USE: + METRICS["slt_judge_gpt_4o_basic_fr-de"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o-basic-fr-de", + system_style="basic", + few_shot_style="fr-de", + ) +if "slt_judge_gpt_4o_detailed_diverse" in METRICS_TO_USE: + METRICS["slt_judge_gpt_4o_detailed_diverse"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o-detailed-diverse", + system_style="detailed", + few_shot_style="diverse", + ) +if "slt_judge_gpt_4o_detailed_fr-de" in METRICS_TO_USE: + METRICS["slt_judge_gpt_4o_detailed_fr-de"] = get_swiss_legal_translation_judge( + judge_model_name="openai/gpt-4o-2024-11-20", + short_judge_name="slt_judge_gpt-4o-detailed-fr-de", + system_style="detailed", + few_shot_style="fr-de", ) if "slt_judge_haiku_35" in METRICS_TO_USE: METRICS["slt_judge_haiku_35"] = get_swiss_legal_translation_judge( From c7ca83f50659241cc60b9b1febc99800f8e9a8a4 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 25 Dec 2024 23:07:42 +0100 Subject: [PATCH 43/43] Fixed typo. 
--- community_tasks/swiss_legal_evals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py index d6d8e61d1..7bcb77c79 100644 --- a/community_tasks/swiss_legal_evals.py +++ b/community_tasks/swiss_legal_evals.py @@ -908,7 +908,7 @@ def prompt_fn(line: dict, task_name: str = None): short_judge_name="slt_judge_gpt-4o-mini", ) if "slt_judge_gpt_4o" in METRICS_TO_USE: - METRICS["slt_judge_gpt_4o_basic_diverse"] = get_swiss_legal_translation_judge( + METRICS["slt_judge_gpt_4o"] = get_swiss_legal_translation_judge( judge_model_name="openai/gpt-4o-2024-11-20", short_judge_name="slt_judge_gpt-4o", )
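
To make the judge configuration introduced in PATCH 36 and PATCH 42 easier to follow, here is a minimal standalone sketch of how the system prompt, user instructions, and few-shot examples are combined into the messages sent to the translation judge, and how the four gpt-4o variants differ only in their (system_style, few_shot_style) pair. The shortened SYSTEM, USER, and FEW_SHOT placeholder texts, the helper name build_judge_messages, and the __main__ demo are illustrative only and not part of the patch series; the user block is keyed by the same style as the system prompt, mirroring the patched code.

from itertools import product

# Placeholders for the full prompt dictionaries added in PATCH 36.
SYSTEM = {"basic": "<basic judge persona>", "detailed": "<detailed judge persona>"}
USER = {"basic": "<basic scoring instructions>\n", "detailed": "<detailed rubric and guidelines>\n"}
FEW_SHOT = {"diverse": "<three multilingual examples>\n", "fr-de": "<three fr->de examples>\n"}


def build_judge_messages(source, gold, answer, system_style="basic", few_shot_style="diverse"):
    """Assemble the chat messages sent to the translation judge for one sample."""
    instruction = (
        "Judge the below case, give the brief reasoning process and the correctness score.\n\n"
        f"Source Text:\n```{source}```\n\n"
        f"Golden Translation:\n```{gold}```\n\n"
        f"Model's Translation:\n```{answer}```\n\n"
        "Your Judgment:\n"
    )
    # System prompt, user instructions, few-shot examples, then the sample to judge.
    user_prompt = USER[system_style] + FEW_SHOT[few_shot_style] + instruction
    return [
        {"role": "system", "content": SYSTEM[system_style]},
        {"role": "user", "content": user_prompt},
    ]


if __name__ == "__main__":
    # The four gpt-4o judge variants registered in PATCH 42 differ only in this pair.
    for system_style, few_shot_style in product(SYSTEM, FEW_SHOT):
        messages = build_judge_messages(
            source="Le contrat est nul ...",
            gold="Ein Vertrag ... ist nichtig.",
            answer="Der Vertrag ist nichtig ...",
            system_style=system_style,
            few_shot_style=few_shot_style,
        )
        print(f"slt_judge_gpt-4o-{system_style}-{few_shot_style}: {len(messages)} messages")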
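
The defensive error handling added in PATCH 41 can likewise be read in isolation. The sketch below mirrors the normalization logic inside the patched compute method, under the assumption that GEMBA returns, per sample, either a dict of error annotations or a non-dict placeholder such as NaN; format_gemba_errors is an illustrative name, not part of the patch.

def format_gemba_errors(errors):
    """Normalize per-sample GEMBA error annotations into lists of single-key dicts."""
    formatted_errors = []
    for error in errors:
        if isinstance(error, dict):
            # Split the (default)dict into one {category: details} dict per error type.
            formatted_errors.append([{key: value} for key, value in error.items()])
        else:
            # No parseable error annotations for this sample (e.g. float("nan")).
            formatted_errors.append([{"error": ["No error details available"]}])
    return formatted_errors


if __name__ == "__main__":
    print(format_gemba_errors([{"accuracy": ["mistranslation"]}, float("nan")]))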