diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index ab190e6b6..df9af332a 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -13,12 +13,12 @@ from lighteval.metrics.metrics_sample import ( BLEU, BLEURT, + MRR, ROUGE, BertScore, ExactMatches, F1_score, LoglikelihoodAcc, - MRR, Recall, StringDistance, acc_golds_likelihood, diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index aa0285360..895acee85 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -9,8 +9,8 @@ import sklearn.metrics from lighteval.metrics.sample_preparator import ( - LogprobCorpusMetricInput, GenerativeCorpusMetricInput, + LogprobCorpusMetricInput, PerplexityCorpusMetricInput, ) from lighteval.utils import as_list @@ -41,7 +41,7 @@ def __init__(self, average: str, num_classes: int = 2): """ if self.average not in ["weighted", "macro", "micro"]: raise ValueError(f"A CorpusLevelF1Score must be initialized with weighted, macro, micro as an average function. {average} was used.") - self.average = average + self.average = average self.num_classes = num_classes def compute(self, items: list[LogprobCorpusMetricInput]): @@ -87,12 +87,12 @@ def compute(self, items: list[GenerativeCorpusMetricInput]) -> float: class CorpusLevelPerplexityMetric: def __init__(self, metric_type: str): - """Stores the relevant parameter for a corpus level perplexity metric. - Perplexity metrics compute more or less the same thing, which is a variation on the - average of log-probabilities over a sequence, but the normalization and processing applied + """Stores the relevant parameter for a corpus level perplexity metric. + Perplexity metrics compute more or less the same thing, which is a variation on the + average of log-probabilities over a sequence, but the normalization and processing applied is different depending on the metric type. - Perplexity uses an exponential and no weights for the average, weighted perplexity uses an exponential - and the number of words as weights for the log-prob average, and bits per byte uses the number of bits + Perplexity uses an exponential and no weights for the average, weighted perplexity uses an exponential + and the number of words as weights for the log-prob average, and bits per byte uses the number of bits for normalization and divides the results by log(2). Args: diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index b6ddbb6cf..ec123741b 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -31,17 +31,17 @@ def __init__( """An exact match class. Args: - aggregation_function (callable, optional): How to aggregate the item results. Defaults to max. - Used if there are several golds or predictions on which scores were computed. - normalize_gold (callable, optional): Function to use to normalize the reference strings. + aggregation_function (callable, optional): How to aggregate the item results. Defaults to max. + Used if there are several golds or predictions on which scores were computed. + normalize_gold (callable, optional): Function to use to normalize the reference strings. Defaults to None if no normalization is applied. - normalize_pred (callable, optional): Function to use to normalize the predicted strings. + normalize_pred (callable, optional): Function to use to normalize the predicted strings. Defaults to None if no normalization is applied. strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False. - type_exact_match (str, optional): Defines what type of match to apply (post normalization if present). + type_exact_match (str, optional): Defines what type of match to apply (post normalization if present). Can be any of `prefix`, `suffix` or `full`. Defaults to "full". - `prefix` checks if the prediction starts with the gold, - `suffix` if the prediction ends with the gold, + `prefix` checks if the prediction starts with the gold, + `suffix` if the prediction ends with the gold, `full` if the prediction and gold are equal """ if aggregation_function is None: @@ -87,7 +87,7 @@ def compute_one_item( pred (str): One of the possible predictions Returns: - float: The exact match score. Will be 1 for a match, 0 otherwise. + float: The exact match score. Will be 1 for a match, 0 otherwise. """ if not pred: return 0 @@ -116,14 +116,14 @@ def __init__( normalize_pred: callable = None, strip_strings: bool = False, ): - """An F1 score class. F1 is computed over the bag of words of the golds and predictions. + """An F1 score class. F1 is computed over the bag of words of the golds and predictions. Args: - aggregation_function (callable, optional): How to aggregate the item results. Defaults to max. - Used if there are several golds or predictions on which scores were computed. - normalize_gold (callable, optional): Function to use to normalize the reference strings. + aggregation_function (callable, optional): How to aggregate the item results. Defaults to max. + Used if there are several golds or predictions on which scores were computed. + normalize_gold (callable, optional): Function to use to normalize the reference strings. Defaults to None if no normalization is applied. - normalize_pred (callable, optional): Function to use to normalize the predicted strings. + normalize_pred (callable, optional): Function to use to normalize the predicted strings. Defaults to None if no normalization is applied. strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False. """ @@ -180,14 +180,14 @@ def compute_one_item(self, gold: str, pred: str) -> float: class LoglikelihoodAcc: def __init__(self, length_normalization: bool = False, ignore_first_space: bool = False) -> None: - """Log likelihood accuracy class. It tests if the highest log-probability of the possible choices + """Log likelihood accuracy class. It tests if the highest log-probability of the possible choices is actually in the gold ones. Args: length_normalization (bool, optional): Whether log-likelihood scores should be normalized for sentence length. Defaults to False. Should be True for most cases. ignore_first_space (bool, optional): Whether to ignore the first token's log prob (if it's a space only). Defaults to False. - Only case when it should be True is when the possible choices (for example `A`,`B` ...) have an extra + Only case when it should be True is when the possible choices (for example `A`,`B` ...) have an extra space added in front of them to manage tokenization issues (` A`, ` B`, ...) for some models. """ self.length_normalization = length_normalization @@ -200,7 +200,7 @@ def compute(self, gold_ixs: list[int], choices_logprob: list[float], formatted_d Args: gold_ixs (list[int]): All the gold choices indices choices_logprob (list[float]): Summed log-probabilities of all the possible choices for the model, ordered as the choices. - formatted_doc (Doc): Original document for the sample. + formatted_doc (Doc): Original document for the sample. Used to get the original choices's length for possible normalisation Returns: @@ -227,13 +227,13 @@ def __init__(self, at: int) -> None: """Recall metric class. It checks if the top `at` best choices include one of the golds or not. Args: - at (int): Depth level of the recall. + at (int): Depth level of the recall. Recall at 1 is equivalent to a logprob accuracy without normalization. """ self.recall_depth = at def compute(self, choices_logprob: list[float], gold_ixs: list[int], **kwargs) -> int: - """Computes the recall at the requested depth level: looks at the `n` best predicted choices (with the + """Computes the recall at the requested depth level: looks at the `n` best predicted choices (with the highest log probabilies) and see if there is an actual gold among them. Args: @@ -250,7 +250,7 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[int], **kwargs) - class MRR: def __init__(self, length_normalization: bool = False): - """A mean reciprocal rank class. + """A mean reciprocal rank class. Args: length_normalization (bool, optional): Whether to use normalisation be choice length when computing the best log-probabilities. Defaults to False. @@ -263,11 +263,11 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[float], formatted Args: gold_ixs (list[int]): All the gold choices indices choices_logprob (list[float]): Summed log-probabilities of all the possible choices for the model, ordered as the choices. - formatted_doc (Doc): Original document for the sample. + formatted_doc (Doc): Original document for the sample. Used to get the original choices's length for possible normalisation Returns: - float: MRR score. + float: MRR score. """ if self.length_normalization: choices_logprob = [choices_logprob[ix] / len(formatted_doc.choices[ix]) for ix in len(choices_logprob)] @@ -304,14 +304,14 @@ def __init__( Args: methods (str | list[str]): What type of ROUGE scoring to use. Can be one or any of `rouge1`, `rouge2`, `rougeL` or `rougeLsum`. - multiple_golds (bool, optional): Whether to compute ROUGE by allowing the comparision to several golds + multiple_golds (bool, optional): Whether to compute ROUGE by allowing the comparision to several golds at once, or to compute ROUGE on individual gold/prediction pairs and aggregate afterwards. Defaults to False. bootstrap (bool, optional): Whether to use bootstrapping. Defaults to False. - aggregation_function (callable, optional): How to aggregate the item results. Defaults to max. - Used if there are several golds or predictions on which scores were computed. - normalize_gold (callable, optional): Function to use to normalize the reference strings. + aggregation_function (callable, optional): How to aggregate the item results. Defaults to max. + Used if there are several golds or predictions on which scores were computed. + normalize_gold (callable, optional): Function to use to normalize the reference strings. Defaults to None if no normalization is applied. - normalize_pred (callable, optional): Function to use to normalize the predicted strings. + normalize_pred (callable, optional): Function to use to normalize the predicted strings. Defaults to None if no normalization is applied. """ if aggregation_function and bootstrap: @@ -339,7 +339,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float | predictions (list[str]): Predicted strings Returns: - float or dict: Aggregated score over the current sample's items. + float or dict: Aggregated score over the current sample's items. If several rouge functions have been selected, returns a dict which maps name and scores. """ # Normalize @@ -395,9 +395,9 @@ def __init__( `microsoft/deberta-large-mnli` as scorer Args: - normalize_gold (callable, optional): Function to use to normalize the reference strings. + normalize_gold (callable, optional): Function to use to normalize the reference strings. Defaults to None if no normalization is applied. - normalize_pred (callable, optional): Function to use to normalize the predicted strings. + normalize_pred (callable, optional): Function to use to normalize the predicted strings. Defaults to None if no normalization is applied. """ self.bert_scorer = BERTScorer( @@ -415,7 +415,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> dict: predictions (list[str]): Predicted strings Returns: - dict: Scores over the current sample's items. + dict: Scores over the current sample's items. """ golds = as_list(golds) predictions = as_list(predictions) @@ -430,7 +430,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> dict: return {"BERTScore-P": p[0].item(), "BERTScore-R": r[0].item(), "BERTScore-F": f[0].item()} -# todo: make into clean classes with call to normalizer +# todo: make into clean classes with call to normalizer def extractiveness(formatted_doc: Doc, predictions: list[str], **kwargs): inp = remove_braces(formatted_doc.specific["text"]) pred = remove_braces_and_strip(predictions[0]) @@ -442,7 +442,7 @@ def extractiveness(formatted_doc: Doc, predictions: list[str], **kwargs): } -# todo: make into clean classes with call to normalizer +# todo: make into clean classes with call to normalizer def faithfulness(formatted_doc: Doc, predictions: list[str], **kwargs): inp = remove_braces(formatted_doc.specific["text"]) pred = remove_braces_and_strip(predictions[0]) @@ -467,7 +467,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> float: predictions (list[str]): Predicted strings Returns: - float: Score over the current sample's items. + float: Score over the current sample's items. """ if len(predictions) == 1: predictions = predictions * len(golds) @@ -478,7 +478,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> float: class BLEU: def __init__(self, n_gram: int): - """BLEU scorer class. Relies on `nltk`'s sentencebleu for scoring. + """BLEU scorer class. Relies on `nltk`'s sentencebleu for scoring. TODO: Will have to move this to sacrebleu. Args: @@ -494,7 +494,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs): predictions (list[str]): Predicted strings Returns: - float: Score over the current sample's items. + float: Score over the current sample's items. """ return np.mean([self._bleu_score(golds, p) for p in predictions]) @@ -506,7 +506,7 @@ def _bleu_score(self, gold: list[str], pred: str) -> float: predictions (str): One of the predicted strings Returns: - float: Score over the current prediction. + float: Score over the current prediction. """ weights = [1 if ix == self.n_gram else 0 for ix in range(1, 5)] return sentence_bleu([word_tokenize(g) for g in gold], word_tokenize(pred), weights=weights)