Commit 4c671bb

style

1 parent fe9773e commit 4c671bb

3 files changed: +44 -44 lines changed

src/lighteval/metrics/metrics.py (+1 -1)

@@ -13,12 +13,12 @@
 from lighteval.metrics.metrics_sample import (
     BLEU,
     BLEURT,
+    MRR,
     ROUGE,
     BertScore,
     ExactMatches,
     F1_score,
     LoglikelihoodAcc,
-    MRR,
     Recall,
     StringDistance,
     acc_golds_likelihood,

src/lighteval/metrics/metrics_corpus.py (+7 -7)

@@ -9,8 +9,8 @@
 import sklearn.metrics
 
 from lighteval.metrics.sample_preparator import (
-    LogprobCorpusMetricInput,
     GenerativeCorpusMetricInput,
+    LogprobCorpusMetricInput,
     PerplexityCorpusMetricInput,
 )
 from lighteval.utils import as_list

@@ -41,7 +41,7 @@ def __init__(self, average: str, num_classes: int = 2):
         """
         if self.average not in ["weighted", "macro", "micro"]:
             raise ValueError(f"A CorpusLevelF1Score must be initialized with weighted, macro, micro as an average function. {average} was used.")
-        self.average = average
+        self.average = average
         self.num_classes = num_classes
 
     def compute(self, items: list[LogprobCorpusMetricInput]):

@@ -87,12 +87,12 @@ def compute(self, items: list[GenerativeCorpusMetricInput]) -> float:
 
 class CorpusLevelPerplexityMetric:
     def __init__(self, metric_type: str):
-        """Stores the relevant parameter for a corpus level perplexity metric.
-        Perplexity metrics compute more or less the same thing, which is a variation on the
-        average of log-probabilities over a sequence, but the normalization and processing applied
+        """Stores the relevant parameter for a corpus level perplexity metric.
+        Perplexity metrics compute more or less the same thing, which is a variation on the
+        average of log-probabilities over a sequence, but the normalization and processing applied
         is different depending on the metric type.
-        Perplexity uses an exponential and no weights for the average, weighted perplexity uses an exponential
-        and the number of words as weights for the log-prob average, and bits per byte uses the number of bits
+        Perplexity uses an exponential and no weights for the average, weighted perplexity uses an exponential
+        and the number of words as weights for the log-prob average, and bits per byte uses the number of bits
         for normalization and divides the results by log(2).
 
         Args:
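
A note on the CorpusLevelPerplexityMetric docstring touched above: the three normalizations it describes can be written out as a minimal sketch (the function names, and the assumption that `logprobs` holds one summed log-probability per sequence, are illustrative only, not lighteval's actual API):

import math

def corpus_perplexity(logprobs: list[float]) -> float:
    # Plain perplexity: exponential of the negative, unweighted mean log-probability.
    return math.exp(-sum(logprobs) / len(logprobs))

def corpus_weighted_perplexity(logprobs: list[float], word_counts: list[int]) -> float:
    # Weighted perplexity: the log-prob average is weighted by the number of words.
    return math.exp(-sum(logprobs) / sum(word_counts))

def corpus_bits_per_byte(logprobs: list[float], byte_counts: list[int]) -> float:
    # Bits per byte: normalize by the byte count and divide by log(2) to convert nats to bits.
    return -sum(logprobs) / sum(byte_counts) / math.log(2)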

src/lighteval/metrics/metrics_sample.py (+36 -36)

@@ -31,17 +31,17 @@ def __init__(
         """An exact match class.
 
         Args:
-            aggregation_function (callable, optional): How to aggregate the item results. Defaults to max.
-                Used if there are several golds or predictions on which scores were computed.
-            normalize_gold (callable, optional): Function to use to normalize the reference strings.
+            aggregation_function (callable, optional): How to aggregate the item results. Defaults to max.
+                Used if there are several golds or predictions on which scores were computed.
+            normalize_gold (callable, optional): Function to use to normalize the reference strings.
                 Defaults to None if no normalization is applied.
-            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
+            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                 Defaults to None if no normalization is applied.
             strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False.
-            type_exact_match (str, optional): Defines what type of match to apply (post normalization if present).
+            type_exact_match (str, optional): Defines what type of match to apply (post normalization if present).
                 Can be any of `prefix`, `suffix` or `full`. Defaults to "full".
-                `prefix` checks if the prediction starts with the gold,
-                `suffix` if the prediction ends with the gold,
+                `prefix` checks if the prediction starts with the gold,
+                `suffix` if the prediction ends with the gold,
                 `full` if the prediction and gold are equal
         """
         if aggregation_function is None:
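
As a quick illustration of the `type_exact_match` options documented in this hunk, here is a minimal sketch of the three comparison modes (a simplified free function, without the normalization and stripping the class also supports):

def exact_match(gold: str, pred: str, type_exact_match: str = "full") -> int:
    # `prefix`: the prediction starts with the gold; `suffix`: it ends with the gold;
    # `full`: prediction and gold must be identical.
    if type_exact_match == "prefix":
        return int(pred.startswith(gold))
    if type_exact_match == "suffix":
        return int(pred.endswith(gold))
    return int(gold == pred)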
@@ -87,7 +87,7 @@ def compute_one_item(
             pred (str): One of the possible predictions
 
         Returns:
-            float: The exact match score. Will be 1 for a match, 0 otherwise.
+            float: The exact match score. Will be 1 for a match, 0 otherwise.
         """
         if not pred:
             return 0

@@ -116,14 +116,14 @@ def __init__(
         normalize_pred: callable = None,
         strip_strings: bool = False,
     ):
-        """An F1 score class. F1 is computed over the bag of words of the golds and predictions.
+        """An F1 score class. F1 is computed over the bag of words of the golds and predictions.
 
         Args:
-            aggregation_function (callable, optional): How to aggregate the item results. Defaults to max.
-                Used if there are several golds or predictions on which scores were computed.
-            normalize_gold (callable, optional): Function to use to normalize the reference strings.
+            aggregation_function (callable, optional): How to aggregate the item results. Defaults to max.
+                Used if there are several golds or predictions on which scores were computed.
+            normalize_gold (callable, optional): Function to use to normalize the reference strings.
                 Defaults to None if no normalization is applied.
-            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
+            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                 Defaults to None if no normalization is applied.
             strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False.
         """
@@ -180,14 +180,14 @@ def compute_one_item(self, gold: str, pred: str) -> float:
 
 class LoglikelihoodAcc:
     def __init__(self, length_normalization: bool = False, ignore_first_space: bool = False) -> None:
-        """Log likelihood accuracy class. It tests if the highest log-probability of the possible choices
+        """Log likelihood accuracy class. It tests if the highest log-probability of the possible choices
         is actually in the gold ones.
 
         Args:
            length_normalization (bool, optional): Whether log-likelihood scores should be normalized for sentence length. Defaults to False.
                Should be True for most cases.
            ignore_first_space (bool, optional): Whether to ignore the first token's log prob (if it's a space only). Defaults to False.
-                Only case when it should be True is when the possible choices (for example `A`,`B` ...) have an extra
+                Only case when it should be True is when the possible choices (for example `A`,`B` ...) have an extra
                space added in front of them to manage tokenization issues (` A`, ` B`, ...) for some models.
         """
         self.length_normalization = length_normalization
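
For intuition about the accuracy check described in that docstring, a minimal sketch of the core logic with optional length normalization (a free function for illustration, not the class's actual compute method; `choice_lengths` is a hypothetical argument standing in for the choice lengths read from the formatted document):

import numpy as np

def loglikelihood_acc(gold_ixs: list[int], choices_logprob: list[float], choice_lengths: list[int] | None = None) -> int:
    # With length normalization, each summed log-probability is divided by its choice's length
    # before taking the argmax; the score is 1 if the best-scoring choice is one of the golds.
    if choice_lengths is not None:
        choices_logprob = [lp / ln for lp, ln in zip(choices_logprob, choice_lengths)]
    return int(np.argmax(choices_logprob) in gold_ixs)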
@@ -200,7 +200,7 @@ def compute(self, gold_ixs: list[int], choices_logprob: list[float], formatted_d
         Args:
             gold_ixs (list[int]): All the gold choices indices
             choices_logprob (list[float]): Summed log-probabilities of all the possible choices for the model, ordered as the choices.
-            formatted_doc (Doc): Original document for the sample.
+            formatted_doc (Doc): Original document for the sample.
                 Used to get the original choices's length for possible normalisation
 
         Returns:

@@ -227,13 +227,13 @@ def __init__(self, at: int) -> None:
         """Recall metric class. It checks if the top `at` best choices include one of the golds or not.
 
         Args:
-            at (int): Depth level of the recall.
+            at (int): Depth level of the recall.
                 Recall at 1 is equivalent to a logprob accuracy without normalization.
         """
         self.recall_depth = at
 
     def compute(self, choices_logprob: list[float], gold_ixs: list[int], **kwargs) -> int:
-        """Computes the recall at the requested depth level: looks at the `n` best predicted choices (with the
+        """Computes the recall at the requested depth level: looks at the `n` best predicted choices (with the
         highest log probabilies) and see if there is an actual gold among them.
 
         Args:

@@ -250,7 +250,7 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[int], **kwargs) -
 
 class MRR:
     def __init__(self, length_normalization: bool = False):
-        """A mean reciprocal rank class.
+        """A mean reciprocal rank class.
 
         Args:
             length_normalization (bool, optional): Whether to use normalisation be choice length when computing the best log-probabilities. Defaults to False.

@@ -263,11 +263,11 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[float], formatted
         Args:
             gold_ixs (list[int]): All the gold choices indices
             choices_logprob (list[float]): Summed log-probabilities of all the possible choices for the model, ordered as the choices.
-            formatted_doc (Doc): Original document for the sample.
+            formatted_doc (Doc): Original document for the sample.
                 Used to get the original choices's length for possible normalisation
 
         Returns:
-            float: MRR score.
+            float: MRR score.
         """
         if self.length_normalization:
             choices_logprob = [choices_logprob[ix] / len(formatted_doc.choices[ix]) for ix in len(choices_logprob)]
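
To make the two ranking metrics in these hunks concrete, here is a minimal sketch of recall at depth `k` and of the reciprocal-rank score for a single sample (free functions for illustration, not the classes' actual compute methods; length normalization is omitted):

import numpy as np

def recall_at_k(choices_logprob: list[float], gold_ixs: list[int], k: int) -> int:
    # 1 if any gold index appears among the k choices with the highest log-probability.
    top_k = np.argsort(choices_logprob)[::-1][:k]
    return int(any(ix in gold_ixs for ix in top_k))

def mrr(choices_logprob: list[float], gold_ixs: list[int]) -> float:
    # Reciprocal of the best rank (1-indexed) obtained by any gold choice.
    ranking = np.argsort(choices_logprob)[::-1].tolist()
    return 1.0 / (min(ranking.index(ix) for ix in gold_ixs) + 1)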
@@ -304,14 +304,14 @@ def __init__(
 
         Args:
             methods (str | list[str]): What type of ROUGE scoring to use. Can be one or any of `rouge1`, `rouge2`, `rougeL` or `rougeLsum`.
-            multiple_golds (bool, optional): Whether to compute ROUGE by allowing the comparision to several golds
+            multiple_golds (bool, optional): Whether to compute ROUGE by allowing the comparision to several golds
                 at once, or to compute ROUGE on individual gold/prediction pairs and aggregate afterwards. Defaults to False.
             bootstrap (bool, optional): Whether to use bootstrapping. Defaults to False.
-            aggregation_function (callable, optional): How to aggregate the item results. Defaults to max.
-                Used if there are several golds or predictions on which scores were computed.
-            normalize_gold (callable, optional): Function to use to normalize the reference strings.
+            aggregation_function (callable, optional): How to aggregate the item results. Defaults to max.
+                Used if there are several golds or predictions on which scores were computed.
+            normalize_gold (callable, optional): Function to use to normalize the reference strings.
                 Defaults to None if no normalization is applied.
-            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
+            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                 Defaults to None if no normalization is applied.
         """
         if aggregation_function and bootstrap:

@@ -339,7 +339,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float |
             predictions (list[str]): Predicted strings
 
         Returns:
-            float or dict: Aggregated score over the current sample's items.
+            float or dict: Aggregated score over the current sample's items.
                 If several rouge functions have been selected, returns a dict which maps name and scores.
         """
         # Normalize

@@ -395,9 +395,9 @@ def __init__(
         `microsoft/deberta-large-mnli` as scorer
 
         Args:
-            normalize_gold (callable, optional): Function to use to normalize the reference strings.
+            normalize_gold (callable, optional): Function to use to normalize the reference strings.
                 Defaults to None if no normalization is applied.
-            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
+            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                 Defaults to None if no normalization is applied.
         """
         self.bert_scorer = BERTScorer(

@@ -415,7 +415,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> dict:
             predictions (list[str]): Predicted strings
 
         Returns:
-            dict: Scores over the current sample's items.
+            dict: Scores over the current sample's items.
         """
         golds = as_list(golds)
         predictions = as_list(predictions)

@@ -430,7 +430,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> dict:
         return {"BERTScore-P": p[0].item(), "BERTScore-R": r[0].item(), "BERTScore-F": f[0].item()}
 
 
-# todo: make into clean classes with call to normalizer
+# todo: make into clean classes with call to normalizer
 def extractiveness(formatted_doc: Doc, predictions: list[str], **kwargs):
     inp = remove_braces(formatted_doc.specific["text"])
     pred = remove_braces_and_strip(predictions[0])

@@ -442,7 +442,7 @@ def extractiveness(formatted_doc: Doc, predictions: list[str], **kwargs):
     }
 
 
-# todo: make into clean classes with call to normalizer
+# todo: make into clean classes with call to normalizer
 def faithfulness(formatted_doc: Doc, predictions: list[str], **kwargs):
     inp = remove_braces(formatted_doc.specific["text"])
     pred = remove_braces_and_strip(predictions[0])

@@ -467,7 +467,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> float:
             predictions (list[str]): Predicted strings
 
         Returns:
-            float: Score over the current sample's items.
+            float: Score over the current sample's items.
         """
         if len(predictions) == 1:
             predictions = predictions * len(golds)

@@ -478,7 +478,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> float:
 
 class BLEU:
     def __init__(self, n_gram: int):
-        """BLEU scorer class. Relies on `nltk`'s sentencebleu for scoring.
+        """BLEU scorer class. Relies on `nltk`'s sentencebleu for scoring.
         TODO: Will have to move this to sacrebleu.
 
         Args:

@@ -494,7 +494,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs):
             predictions (list[str]): Predicted strings
 
         Returns:
-            float: Score over the current sample's items.
+            float: Score over the current sample's items.
         """
         return np.mean([self._bleu_score(golds, p) for p in predictions])
 

@@ -506,7 +506,7 @@ def _bleu_score(self, gold: list[str], pred: str) -> float:
             predictions (str): One of the predicted strings
 
         Returns:
-            float: Score over the current prediction.
+            float: Score over the current prediction.
         """
         weights = [1 if ix == self.n_gram else 0 for ix in range(1, 5)]
         return sentence_bleu([word_tokenize(g) for g in gold], word_tokenize(pred), weights=weights)
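
The `weights` line in the last hunk is how the class restricts `nltk`'s sentence_bleu to a single n-gram order; a standalone sketch under the same assumptions (requires `nltk` plus its `punkt` tokenizer data; the function name is illustrative):

from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu

def single_ngram_bleu(golds: list[str], pred: str, n_gram: int) -> float:
    # All weight goes to one n-gram order: n_gram=2 yields weights [0, 1, 0, 0],
    # so only bigram precision contributes to the score.
    weights = [1 if ix == n_gram else 0 for ix in range(1, 5)]
    return sentence_bleu([word_tokenize(g) for g in golds], word_tokenize(pred), weights=weights)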
