Merge pull request #2 from huggingface/clem_doc_readme
Doc metrics + README
clefourrier authored Jan 26, 2024
2 parents c968089 + d83573d commit 1b41cc4
Showing 14 changed files with 400 additions and 919 deletions.
README.md: 99 changes (50 additions, 49 deletions)

Large diffs are not rendered by default.

src/lighteval/metrics/metrics.py: 4 changes (2 additions, 2 deletions)
@@ -13,6 +13,7 @@
 from lighteval.metrics.metrics_sample import (
     BLEU,
     BLEURT,
+    MRR,
     ROUGE,
     BertScore,
     ExactMatches,
@@ -23,7 +24,6 @@
     acc_golds_likelihood,
     extractiveness,
     faithfulness,
-    mrr,
 )
 from lighteval.metrics.normalizations import (
     bigbench_normalizer,
@@ -277,7 +277,7 @@ class Metrics(Enum):
     )
     mrr = SampleLevelMetric(
         metric="mrr",
-        sample_level_fn=mrr,
+        sample_level_fn=MRR().compute,
         category=MetricCategory.MULTICHOICE,
         use_case=MetricUseCase.ACCURACY,
         corpus_level_fn=np.mean,
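Note on the metrics.py change above: the mrr metric's sample_level_fn now points at the compute method of the MRR class imported from metrics_sample, instead of the removed free function mrr. As a rough sketch of what a sample-level reciprocal-rank computation looks like (the function name, signature, and example values below are illustrative assumptions, not lighteval's MRR class):

import numpy as np

def reciprocal_rank(choices_logprob: list[float], gold_ixs: list[int]) -> float:
    """Illustrative only: rank the choices from most to least likely and
    return 1 / rank of the best-ranked gold choice."""
    ranked = np.argsort(choices_logprob)[::-1]  # choice indices, most likely first
    for rank, choice_ix in enumerate(ranked, start=1):
        if choice_ix in gold_ixs:
            return 1.0 / rank
    return 0.0

# Example: the gold choice (index 2) is the second most likely, so the reciprocal rank is 0.5
print(reciprocal_rank([-1.2, -0.3, -0.7, -2.0], gold_ixs=[2]))

The corpus_level_fn=np.mean setting then averages these per-sample reciprocal ranks into the mean reciprocal rank.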
src/lighteval/metrics/metrics_corpus.py: 47 changes (38 additions, 9 deletions)
@@ -1,4 +1,4 @@
"""This module manages all the score aggregations and computations occurring at the corpus level.
"""This module manages all the metrics occurring at the corpus level.
Some metrics (such as corpus BLEU) are not computed at the individual item level, but over all the corpus.
A number of these aggregations come from the EleutherAIHarness
"""
@@ -10,6 +10,7 @@

 from lighteval.metrics.sample_preparator import (
     GenerativeCorpusMetricInput,
+    LogprobCorpusMetricInput,
     PerplexityCorpusMetricInput,
 )
 from lighteval.utils import as_list
@@ -20,7 +21,7 @@ def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float:
     """Computes the Matthews Correlation Coefficient, using scikit learn ([doc](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)).

     Args:
-        items (list[dict]): List of the correctly formatted dictionarinput
+        items (list[dict]): List of GenerativeCorpusMetricInput

     Returns:
         float: Score
@@ -32,13 +33,23 @@ def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float:

 class CorpusLevelF1Score:
     def __init__(self, average: str, num_classes: int = 2):
-        # If num_classes > 2, we compute multi_f1_corpus_aggregation
-        self.average = average  # weighted, macro, micro
+        """Stores the relevant parameters for the task's corpus level f1 score.
+
+        Args:
+            average (str): Method to use to compute the f1 score. Can be weighted, macro, micro.
+            num_classes (int, optional): Num of possible choice classes. Defaults to 2. If this parameter is above 2, we'll compute multi f1 corpus score
+        """
+        if average not in ["weighted", "macro", "micro", None]:
+            raise ValueError(
+                f"A CorpusLevelF1Score must be initialized with weighted, macro, micro, or None as an average function. {average} was used."
+            )
+        self.average = average
         self.num_classes = num_classes

-    def compute(self, items):
-        golds = [i["golds"] for i in items]
-        preds = [i["preds"] for i in items]
+    def compute(self, items: list[LogprobCorpusMetricInput]):
+        """Computes the metric score over all the corpus generated items, by using the scikit learn implementation."""
+        golds = [i.golds for i in items]
+        preds = [i.preds for i in items]
         # Single f1
         if self.num_classes == 2:
             fscore = sklearn.metrics.f1_score(golds, preds, average=self.average)
@@ -48,11 +59,16 @@ def compute(self, items):
         f1s = []
         for i in range(self.num_classes):
             f1s.append(sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i))
-        return np.mean(f1s)
+        return float(np.mean(f1s))


 class CorpusLevelTranslationMetric:
     def __init__(self, metric_type: str):
+        """Stores the relevant parameters for a corpus level translation metric.
+
+        Args:
+            metric_type (str): Can be any of bleu, chrf, or ter depending on the metric to use.
+        """
         if metric_type == "bleu":
             self.metric = sacrebleu.corpus_bleu
         elif metric_type == "chrf":
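The new CorpusLevelF1Score docstring and the multi-class branch above describe how the corpus F1 is aggregated when num_classes is above 2: one binary F1 per class (class i against the rest), then the unweighted mean. A small self-contained sketch of that aggregation, with a hypothetical helper name and made-up example data (not lighteval's exact code):

import numpy as np
import sklearn.metrics

def multiclass_corpus_f1(golds: list[int], preds: list[int], num_classes: int) -> float:
    """Illustrative only: one binary F1 per class (class i vs. the rest), then the unweighted mean."""
    golds, preds = np.asarray(golds), np.asarray(preds)
    f1s = [
        sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i)
        for i in range(num_classes)
    ]
    return float(np.mean(f1s))

# Example with three classes: one class-1 item is mispredicted as class 2
print(multiclass_corpus_f1(golds=[0, 1, 2, 2, 1], preds=[0, 2, 2, 2, 1], num_classes=3))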
@@ -63,19 +79,32 @@ def __init__(self, metric_type: str):
             raise ValueError(f"Unknown corpus level translation metric type : {metric_type}")

     def compute(self, items: list[GenerativeCorpusMetricInput]) -> float:
+        """Computes the metric score over all the corpus generated items, by using the sacrebleu implementation."""
         golds = [i.golds for i in items]
         preds = [as_list(i.preds) for i in items]
-        return self.metric(hypotheses=preds, references=golds).score
+        return float(self.metric(hypotheses=preds, references=golds).score)


 class CorpusLevelPerplexityMetric:
     def __init__(self, metric_type: str):
+        """Stores the relevant parameter for a corpus level perplexity metric.
+        Perplexity metrics compute more or less the same thing, which is a variation on the
+        average of log-probabilities over a sequence, but the normalization and processing applied
+        is different depending on the metric type.
+        Perplexity uses an exponential and no weights for the average, weighted perplexity uses an exponential
+        and the number of words as weights for the log-prob average, and bits per byte uses the number of bits
+        for normalization and divides the results by log(2).
+
+        Args:
+            metric_type (str): Can be any of `perplexity`, `weighted_perplexity` or `bits_per_byte`
+        """
         if metric_type not in ["perplexity", "weighted_perplexity", "bits_per_byte"]:
             raise ValueError(f"Unknown corpus level perplexity metric type : {metric_type}")

         self.metric_type = metric_type

     def compute(self, items: list[PerplexityCorpusMetricInput]):
+        """Computes the metric score over all the corpus generated items."""
         logprobs = [i.logprobs for i in items]
         weights = [i.weights for i in items]

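The docstring added to CorpusLevelPerplexityMetric distinguishes three aggregations: plain perplexity (exponential of the unweighted average negative log-probability), weighted perplexity (the same average, but weighted by the number of words), and bits per byte (normalized by the number of bytes and divided by log(2)). A minimal sketch of those three formulas as described there; the function name, the exact semantics of weights, and the example values are assumptions for illustration, not lighteval's implementation:

import math

def aggregate_perplexity(logprobs: list[float], weights: list[int], metric_type: str) -> float:
    """Illustrative corpus-level aggregation for the three perplexity variants described above."""
    if metric_type == "perplexity":
        # exponential of the unweighted average negative log-probability
        return math.exp(-sum(logprobs) / len(logprobs))
    if metric_type == "weighted_perplexity":
        # exponential of the negative log-probability average, weighted by word counts
        return math.exp(-sum(logprobs) / sum(weights))
    if metric_type == "bits_per_byte":
        # normalize by the number of bytes and convert nats to bits via log(2)
        return -sum(logprobs) / (sum(weights) * math.log(2))
    raise ValueError(f"Unknown corpus level perplexity metric type : {metric_type}")

# Example: two documents with summed log-probs -12.0 and -8.5 over 10 and 7 words
print(aggregate_perplexity([-12.0, -8.5], weights=[10, 7], metric_type="weighted_perplexity"))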