@@ -31,17 +31,17 @@ def __init__(
        """An exact match class.

        Args:
            aggregation_function (callable, optional): How to aggregate the item results. Defaults to max.
                Used if there are several golds or predictions on which scores were computed.
            normalize_gold (callable, optional): Function to use to normalize the reference strings.
                Defaults to None if no normalization is applied.
            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                Defaults to None if no normalization is applied.
            strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False.
            type_exact_match (str, optional): Defines what type of match to apply (post normalization if present).
                Can be any of `prefix`, `suffix` or `full`. Defaults to "full".
                `prefix` checks if the prediction starts with the gold,
                `suffix` if the prediction ends with the gold,
                `full` if the prediction and gold are equal.
        """
        if aggregation_function is None:
@@ -87,7 +87,7 @@ def compute_one_item(
            pred (str): One of the possible predictions

        Returns:
            float: The exact match score. Will be 1 for a match, 0 otherwise.
        """
        if not pred:
            return 0
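For illustration, here is a minimal standalone sketch (not the class above; the helper name is hypothetical) of how the three match types behave once normalization and stripping are done:

def _match(gold: str, pred: str, match_type: str = "full") -> int:
    # Empty predictions never match, mirroring the `if not pred` guard above.
    if not pred:
        return 0
    if match_type == "prefix":
        return int(pred.startswith(gold))
    if match_type == "suffix":
        return int(pred.endswith(gold))
    return int(gold == pred)  # "full"

# e.g. _match("Paris", "Paris is the capital", "prefix") == 1, while the "full" match is 0.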
@@ -116,14 +116,14 @@ def __init__(
        normalize_pred: callable = None,
        strip_strings: bool = False,
    ):
        """An F1 score class. F1 is computed over the bag of words of the golds and predictions.

        Args:
            aggregation_function (callable, optional): How to aggregate the item results. Defaults to max.
                Used if there are several golds or predictions on which scores were computed.
            normalize_gold (callable, optional): Function to use to normalize the reference strings.
                Defaults to None if no normalization is applied.
            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                Defaults to None if no normalization is applied.
            strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False.
        """
@@ -180,14 +180,14 @@ def compute_one_item(self, gold: str, pred: str) -> float:
180
180
181
181
class LoglikelihoodAcc :
182
182
def __init__ (self , length_normalization : bool = False , ignore_first_space : bool = False ) -> None :
183
- """Log likelihood accuracy class. It tests if the highest log-probability of the possible choices
183
+ """Log likelihood accuracy class. It tests if the highest log-probability of the possible choices
184
184
is actually in the gold ones.
185
185
186
186
Args:
187
187
length_normalization (bool, optional): Whether log-likelihood scores should be normalized for sentence length. Defaults to False.
188
188
Should be True for most cases.
189
189
ignore_first_space (bool, optional): Whether to ignore the first token's log prob (if it's a space only). Defaults to False.
190
- Only case when it should be True is when the possible choices (for example `A`,`B` ...) have an extra
190
+ Only case when it should be True is when the possible choices (for example `A`,`B` ...) have an extra
191
191
space added in front of them to manage tokenization issues (` A`, ` B`, ...) for some models.
192
192
"""
193
193
self .length_normalization = length_normalization
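The core of the metric can be sketched as follows (a hypothetical helper, assuming per-choice lengths are available when `length_normalization` is set):

import numpy as np

def loglikelihood_acc(choices_logprob: list[float], gold_ixs: list[int], choice_lengths: list[int] | None = None) -> int:
    if choice_lengths is not None:  # length normalization: compare per-token log-probs
        choices_logprob = [lp / ln for lp, ln in zip(choices_logprob, choice_lengths)]
    return int(int(np.argmax(choices_logprob)) in gold_ixs)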
@@ -200,7 +200,7 @@ def compute(self, gold_ixs: list[int], choices_logprob: list[float], formatted_d
        Args:
            gold_ixs (list[int]): All the gold choice indices
            choices_logprob (list[float]): Summed log-probabilities of all the possible choices for the model, ordered as the choices.
            formatted_doc (Doc): Original document for the sample.
                Used to get the original choices' length for possible normalization

        Returns:
@@ -227,13 +227,13 @@ def __init__(self, at: int) -> None:
227
227
"""Recall metric class. It checks if the top `at` best choices include one of the golds or not.
228
228
229
229
Args:
230
- at (int): Depth level of the recall.
230
+ at (int): Depth level of the recall.
231
231
Recall at 1 is equivalent to a logprob accuracy without normalization.
232
232
"""
233
233
self .recall_depth = at
234
234
235
235
def compute (self , choices_logprob : list [float ], gold_ixs : list [int ], ** kwargs ) -> int :
236
- """Computes the recall at the requested depth level: looks at the `n` best predicted choices (with the
236
+ """Computes the recall at the requested depth level: looks at the `n` best predicted choices (with the
237
237
highest log probabilies) and see if there is an actual gold among them.
238
238
239
239
Args:
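A hypothetical sketch of the computation this docstring describes, for recall at depth `k`:

import numpy as np

def recall_at_k(choices_logprob: list[float], gold_ixs: list[int], k: int) -> int:
    top_k = np.argsort(choices_logprob)[::-1][:k]  # indices of the k most likely choices
    return int(any(ix in gold_ixs for ix in top_k))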
@@ -250,7 +250,7 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[int], **kwargs) -
250
250
251
251
class MRR :
252
252
def __init__ (self , length_normalization : bool = False ):
253
- """A mean reciprocal rank class.
253
+ """A mean reciprocal rank class.
254
254
255
255
Args:
256
256
length_normalization (bool, optional): Whether to use normalisation be choice length when computing the best log-probabilities. Defaults to False.
@@ -263,11 +263,11 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[float], formatted
        Args:
            gold_ixs (list[int]): All the gold choice indices
            choices_logprob (list[float]): Summed log-probabilities of all the possible choices for the model, ordered as the choices.
            formatted_doc (Doc): Original document for the sample.
                Used to get the original choices' length for possible normalization

        Returns:
            float: MRR score.
        """
        if self.length_normalization:
            choices_logprob = [choices_logprob[ix] / len(formatted_doc.choices[ix]) for ix in range(len(choices_logprob))]
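For reference, the reciprocal-rank step that follows the (optional) normalization can be sketched as below (hypothetical helper):

import numpy as np

def mrr(choices_logprob: list[float], gold_ixs: list[int]) -> float:
    ranking = list(np.argsort(choices_logprob)[::-1])  # best-scored choice first
    best_gold_rank = min(ranking.index(g) for g in gold_ixs)  # 0-based rank of the best gold
    return 1.0 / (best_gold_rank + 1)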
@@ -304,14 +304,14 @@ def __init__(

        Args:
            methods (str | list[str]): What type of ROUGE scoring to use. Can be one or any of `rouge1`, `rouge2`, `rougeL` or `rougeLsum`.
            multiple_golds (bool, optional): Whether to compute ROUGE by allowing the comparison to several golds
                at once, or to compute ROUGE on individual gold/prediction pairs and aggregate afterwards. Defaults to False.
            bootstrap (bool, optional): Whether to use bootstrapping. Defaults to False.
            aggregation_function (callable, optional): How to aggregate the item results. Defaults to max.
                Used if there are several golds or predictions on which scores were computed.
            normalize_gold (callable, optional): Function to use to normalize the reference strings.
                Defaults to None if no normalization is applied.
            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                Defaults to None if no normalization is applied.
        """
        if aggregation_function and bootstrap:
@@ -339,7 +339,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float |
        predictions (list[str]): Predicted strings

        Returns:
            float or dict: Aggregated score over the current sample's items.
                If several rouge functions have been selected, returns a dict mapping names to scores.
        """
        # Normalize
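A usage sketch, assuming Google's `rouge_score` package as the underlying scorer:

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
scores = scorer.score("the cat sat on the mat", "a cat was on the mat")  # (target, prediction)
print({name: s.fmeasure for name, s in scores.items()})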
@@ -395,9 +395,9 @@ def __init__(
        `microsoft/deberta-large-mnli` as scorer

        Args:
            normalize_gold (callable, optional): Function to use to normalize the reference strings.
                Defaults to None if no normalization is applied.
            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                Defaults to None if no normalization is applied.
        """
        self.bert_scorer = BERTScorer(
@@ -415,7 +415,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> dict:
        predictions (list[str]): Predicted strings

        Returns:
            dict: Scores over the current sample's items.
        """
        golds = as_list(golds)
        predictions = as_list(predictions)
@@ -430,7 +430,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> dict:
        return {"BERTScore-P": p[0].item(), "BERTScore-R": r[0].item(), "BERTScore-F": f[0].item()}


# todo: make into clean classes with call to normalizer
def extractiveness(formatted_doc: Doc, predictions: list[str], **kwargs):
    inp = remove_braces(formatted_doc.specific["text"])
    pred = remove_braces_and_strip(predictions[0])
@@ -442,7 +442,7 @@ def extractiveness(formatted_doc: Doc, predictions: list[str], **kwargs):
    }


# todo: make into clean classes with call to normalizer
def faithfulness(formatted_doc: Doc, predictions: list[str], **kwargs):
    inp = remove_braces(formatted_doc.specific["text"])
    pred = remove_braces_and_strip(predictions[0])
@@ -467,7 +467,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> float:
        predictions (list[str]): Predicted strings

        Returns:
            float: Score over the current sample's items.
        """
        if len(predictions) == 1:
            predictions = predictions * len(golds)
@@ -478,7 +478,7 @@ def compute(self, golds: list[str], predictions: list[str]) -> float:

class BLEU:
    def __init__(self, n_gram: int):
        """BLEU scorer class. Relies on `nltk`'s `sentence_bleu` for scoring.
        TODO: Will have to move this to sacrebleu.

        Args:
@@ -494,7 +494,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs):
        predictions (list[str]): Predicted strings

        Returns:
            float: Score over the current sample's items.
        """
        return np.mean([self._bleu_score(golds, p) for p in predictions])
@@ -506,7 +506,7 @@ def _bleu_score(self, gold: list[str], pred: str) -> float:
506
506
predictions (str): One of the predicted strings
507
507
508
508
Returns:
509
- float: Score over the current prediction.
509
+ float: Score over the current prediction.
510
510
"""
511
511
weights = [1 if ix == self .n_gram else 0 for ix in range (1 , 5 )]
512
512
return sentence_bleu ([word_tokenize (g ) for g in gold ], word_tokenize (pred ), weights = weights )
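Note the weights scheme: for `n_gram == 2`, `weights` is `[0, 1, 0, 0]`, so BLEU-2 here is pure bigram precision (with brevity penalty) rather than cumulative BLEU. A usage sketch with `nltk` (assumes the `punkt` tokenizer data is installed):

from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu

weights = (0, 1, 0, 0)  # what the class builds for n_gram == 2
score = sentence_bleu(
    [word_tokenize("the cat sat on the mat")],  # tokenized reference(s)
    word_tokenize("a cat sat on the mat"),      # tokenized hypothesis
    weights=weights,
)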