@@ -31,17 +31,17 @@ def __init__(
        """An exact match class.

        Args:
            aggregation_function (callable, optional): How to aggregate the item results. Defaults to max.
                Used if there are several golds or predictions on which scores were computed.
            normalize_gold (callable, optional): Function to use to normalize the reference strings.
                Defaults to None if no normalization is applied.
            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                Defaults to None if no normalization is applied.
            strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False.
            type_exact_match (str, optional): Defines what type of match to apply (post normalization if present).
                Can be any of `prefix`, `suffix` or `full`. Defaults to "full".
                `prefix` checks if the prediction starts with the gold,
                `suffix` if the prediction ends with the gold,
                `full` if the prediction and gold are equal.
        """
        if aggregation_function is None:
@@ -87,7 +87,7 @@ def compute_one_item(
            pred (str): One of the possible predictions

        Returns:
            float: The exact match score. Will be 1 for a match, 0 otherwise.
        """
        if not pred:
            return 0
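For illustration, here is a minimal standalone sketch (not the class above; the helper name is hypothetical) of how the three match types behave once normalization and stripping are done:

def _match(gold: str, pred: str, match_type: str = "full") -> int:
    # Empty predictions never match, mirroring the `if not pred` guard above.
    if not pred:
        return 0
    if match_type == "prefix":
        return int(pred.startswith(gold))
    if match_type == "suffix":
        return int(pred.endswith(gold))
    return int(gold == pred)  # "full"

# e.g. _match("Paris", "Paris is the capital", "prefix") == 1, while the "full" match is 0.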
@@ -116,14 +116,14 @@ def __init__(
        normalize_pred: callable = None,
        strip_strings: bool = False,
    ):
        """An F1 score class. F1 is computed over the bag of words of the golds and predictions.

        Args:
            aggregation_function (callable, optional): How to aggregate the item results. Defaults to max.
                Used if there are several golds or predictions on which scores were computed.
            normalize_gold (callable, optional): Function to use to normalize the reference strings.
                Defaults to None if no normalization is applied.
            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                Defaults to None if no normalization is applied.
            strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False.
        """
@@ -180,14 +180,14 @@ def compute_one_item(self, gold: str, pred: str) -> float:
180
180
181
181
class LoglikelihoodAcc :
182
182
def __init__ (self , length_normalization : bool = False , ignore_first_space : bool = False ) -> None :
183
- """Log likelihood accuracy class. It tests if the highest log-probability of the possible choices
183
+ """Log likelihood accuracy class. It tests if the highest log-probability of the possible choices
184
184
is actually in the gold ones.
185
185
186
186
Args:
187
187
length_normalization (bool, optional): Whether log-likelihood scores should be normalized for sentence length. Defaults to False.
188
188
Should be True for most cases.
189
189
ignore_first_space (bool, optional): Whether to ignore the first token's log prob (if it's a space only). Defaults to False.
190
- Only case when it should be True is when the possible choices (for example `A`,`B` ...) have an extra
190
+ Only case when it should be True is when the possible choices (for example `A`,`B` ...) have an extra
191
191
space added in front of them to manage tokenization issues (` A`, ` B`, ...) for some models.
192
192
"""
193
193
self .length_normalization = length_normalization
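The core of the metric can be sketched as follows (a hypothetical helper, assuming per-choice lengths are available when `length_normalization` is set):

import numpy as np

def loglikelihood_acc(choices_logprob: list[float], gold_ixs: list[int], choice_lengths: list[int] | None = None) -> int:
    if choice_lengths is not None:  # length normalization: compare per-token log-probs
        choices_logprob = [lp / ln for lp, ln in zip(choices_logprob, choice_lengths)]
    return int(int(np.argmax(choices_logprob)) in gold_ixs)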
@@ -200,7 +200,7 @@ def compute(self, gold_ixs: list[int], choices_logprob: list[float], formatted_d
        Args:
            gold_ixs (list[int]): All the gold choice indices
            choices_logprob (list[float]): Summed log-probabilities of all the possible choices for the model, ordered as the choices.
            formatted_doc (Doc): Original document for the sample.
                Used to get the original choices' length for possible normalization

        Returns:
@@ -227,13 +227,13 @@ def __init__(self, at: int) -> None:
227
227
"""Recall metric class. It checks if the top `at` best choices include one of the golds or not.
228
228
229
229
Args:
230
- at (int): Depth level of the recall.
230
+ at (int): Depth level of the recall.
231
231
Recall at 1 is equivalent to a logprob accuracy without normalization.
232
232
"""
233
233
self .recall_depth = at
234
234
235
235
def compute (self , choices_logprob : list [float ], gold_ixs : list [int ], ** kwargs ) -> int :
236
- """Computes the recall at the requested depth level: looks at the `n` best predicted choices (with the
236
+ """Computes the recall at the requested depth level: looks at the `n` best predicted choices (with the
237
237
highest log probabilies) and see if there is an actual gold among them.
238
238
239
239
Args:
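A hypothetical sketch of the computation this docstring describes, for recall at depth `k`:

import numpy as np

def recall_at_k(choices_logprob: list[float], gold_ixs: list[int], k: int) -> int:
    top_k = np.argsort(choices_logprob)[::-1][:k]  # indices of the k most likely choices
    return int(any(ix in gold_ixs for ix in top_k))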
@@ -250,7 +250,7 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[int], **kwargs) -
250
250
251
251
class MRR :
252
252
def __init__ (self , length_normalization : bool = False ):
253
- """A mean reciprocal rank class.
253
+ """A mean reciprocal rank class.
254
254
255
255
Args:
256
256
length_normalization (bool, optional): Whether to use normalisation be choice length when computing the best log-probabilities. Defaults to False.
@@ -263,11 +263,11 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[float], formatted
        Args:
            gold_ixs (list[int]): All the gold choice indices
            choices_logprob (list[float]): Summed log-probabilities of all the possible choices for the model, ordered as the choices.
            formatted_doc (Doc): Original document for the sample.
                Used to get the original choices' length for possible normalization

        Returns:
            float: MRR score.
        """
        if self.length_normalization:
            choices_logprob = [choices_logprob[ix] / len(formatted_doc.choices[ix]) for ix in range(len(choices_logprob))]
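For reference, the reciprocal-rank step that follows the (optional) normalization can be sketched as below (hypothetical helper):

import numpy as np

def mrr(choices_logprob: list[float], gold_ixs: list[int]) -> float:
    ranking = list(np.argsort(choices_logprob)[::-1])  # best-scored choice first
    best_gold_rank = min(ranking.index(g) for g in gold_ixs)  # 0-based rank of the best gold
    return 1.0 / (best_gold_rank + 1)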
@@ -304,14 +304,14 @@ def __init__(

        Args:
            methods (str | list[str]): What type of ROUGE scoring to use. Can be one or any of `rouge1`, `rouge2`, `rougeL` or `rougeLsum`.
            multiple_golds (bool, optional): Whether to compute ROUGE by allowing the comparison to several golds
                at once, or to compute ROUGE on individual gold/prediction pairs and aggregate afterwards. Defaults to False.
            bootstrap (bool, optional): Whether to use bootstrapping. Defaults to False.
            aggregation_function (callable, optional): How to aggregate the item results. Defaults to max.
                Used if there are several golds or predictions on which scores were computed.
            normalize_gold (callable, optional): Function to use to normalize the reference strings.
                Defaults to None if no normalization is applied.
            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                Defaults to None if no normalization is applied.
        """
        if aggregation_function and bootstrap:
@@ -339,7 +339,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float |
        predictions (list[str]): Predicted strings

        Returns:
            float or dict: Aggregated score over the current sample's items.
                If several rouge functions have been selected, returns a dict mapping names to scores.
        """
        # Normalize
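A usage sketch, assuming Google's `rouge_score` package as the underlying scorer:

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
scores = scorer.score("the cat sat on the mat", "a cat was on the mat")  # (target, prediction)
print({name: s.fmeasure for name, s in scores.items()})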
@@ -395,9 +395,9 @@ def __init__(
        `microsoft/deberta-large-mnli` as scorer

        Args:
            normalize_gold (callable, optional): Function to use to normalize the reference strings.
                Defaults to None if no normalization is applied.
            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                Defaults to None if no normalization is applied.
        """
        self.bert_scorer = BERTScorer(
@@ -415,7 +415,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> dict:
        predictions (list[str]): Predicted strings

        Returns:
            dict: Scores over the current sample's items.
        """
        golds = as_list(golds)
        predictions = as_list(predictions)
@@ -430,7 +430,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> dict:
        return {"BERTScore-P": p[0].item(), "BERTScore-R": r[0].item(), "BERTScore-F": f[0].item()}


# todo: make into clean classes with call to normalizer
def extractiveness(formatted_doc: Doc, predictions: list[str], **kwargs):
    inp = remove_braces(formatted_doc.specific["text"])
    pred = remove_braces_and_strip(predictions[0])
@@ -442,7 +442,7 @@ def extractiveness(formatted_doc: Doc, predictions: list[str], **kwargs):
    }


# todo: make into clean classes with call to normalizer
def faithfulness(formatted_doc: Doc, predictions: list[str], **kwargs):
    inp = remove_braces(formatted_doc.specific["text"])
    pred = remove_braces_and_strip(predictions[0])
@@ -467,7 +467,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> float:
        predictions (list[str]): Predicted strings

        Returns:
            float: Score over the current sample's items.
        """
        if len(predictions) == 1:
            predictions = predictions * len(golds)
@@ -478,7 +478,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> float:

class BLEU:
    def __init__(self, n_gram: int):
        """BLEU scorer class. Relies on `nltk`'s `sentence_bleu` for scoring.
        TODO: Will have to move this to sacrebleu.

        Args:
@@ -494,7 +494,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs):
        predictions (list[str]): Predicted strings

        Returns:
            float: Score over the current sample's items.
        """
        return np.mean([self._bleu_score(golds, p) for p in predictions])
@@ -506,7 +506,7 @@ def _bleu_score(self, gold: list[str], pred: str) -> float:
506
506
predictions (str): One of the predicted strings
507
507
508
508
Returns:
509
- float: Score over the current prediction.
509
+ float: Score over the current prediction.
510
510
"""
511
511
weights = [1 if ix == self .n_gram else 0 for ix in range (1 , 5 )]
512
512
return sentence_bleu ([word_tokenize (g ) for g in gold ], word_tokenize (pred ), weights = weights )
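Note the weights scheme: for `n_gram == 2`, `weights` is `[0, 1, 0, 0]`, so BLEU-2 here is pure bigram precision (with brevity penalty) rather than cumulative BLEU. A usage sketch with `nltk` (assumes the `punkt` tokenizer data is installed):

from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu

weights = (0, 1, 0, 0)  # what the class builds for n_gram == 2
score = sentence_bleu(
    [word_tokenize("the cat sat on the mat")],  # tokenized reference(s)
    word_tokenize("a cat sat on the mat"),      # tokenized hypothesis
    weights=weights,
)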