Commit

Merge branch 'main' into clem_details
clefourrier authored Jul 18, 2024
2 parents f892248 + 66ed7a2 commit 8038c8c
Showing 6 changed files with 72 additions and 47 deletions.
src/lighteval/evaluator.py: 11 changes (5 additions, 6 deletions)
@@ -64,7 +64,7 @@ def evaluate( # noqa: C901
:return
Dictionary of results
"""
# A request output tupe is a Tuple where the first element is the index of
# A request output tuple is a Tuple where the first element is the index of
# the request for one document of one task i.e.
# task: "arc_easy", doc: "0"# request: "0" -> request_index = 0,
# We can have multiple requests per doc for multi choice tasks for example.
@@ -75,8 +75,11 @@
)
example_id_response_dict: dict[TaskExampleId, list[RequestIndexModelResponseTuple]] = collections.defaultdict(list)

for request_type, requests in requests_dict.items():
for request_type in RequestType:
if request_type not in requests_dict:
continue
hlog(f"Running {request_type} requests")
requests = requests_dict[request_type]
# These are all the request type from the request factory at the moment
if request_type == RequestType.LOGLIKELIHOOD:
full_resps = lm.loglikelihood(requests, override_bs=override_bs)
@@ -99,10 +102,6 @@ def evaluate( # noqa: C901

# ===== unpack results and sort back in order and return control to Task =====
for task_example_id, prediction_list in example_id_response_dict.items():
# ===== Unpack the request =====
prediction_list.sort(
key=lambda x: x.request_index
) # When we use Loglikelihood for several tokens we have all the options here
model_responses = [x.model_response for x in prediction_list]
cur_task_name = task_example_id.task_name.rsplit("|", 1)[0]

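
A note on the evaluator.py change above: the loop now iterates over the `RequestType` enum itself instead of `requests_dict.items()`, skipping types with no requests, so request types are always processed in the enum's declaration order rather than whatever order the dict happened to be built in. A minimal sketch of the pattern, using a cut-down stand-in enum (member names other than `LOGLIKELIHOOD` are illustrative, not lighteval's full `RequestType`):

```python
from enum import Enum, auto


class RequestType(Enum):
    LOGLIKELIHOOD = auto()
    GREEDY_UNTIL = auto()


def run_requests(requests_dict):
    responses = {}
    for request_type in RequestType:            # fixed, declaration-defined order
        if request_type not in requests_dict:   # this run produced no requests of this type
            continue
        requests = requests_dict[request_type]
        responses[request_type] = [f"response for {r}" for r in requests]
    return responses


print(run_requests({RequestType.LOGLIKELIHOOD: ["doc0-req0", "doc0-req1"]}))
```
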
src/lighteval/metrics/__init__.py: 8 changes (3 additions, 5 deletions)
@@ -116,24 +116,22 @@ def apply_generative_metric(

def apply_multichoice_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[Metric]):
outputs = {}
if len(formatted_doc.choices) != len(results):
raise ValueError("Length of results is not equal to the length of the choices")
mc_results = results[: len(formatted_doc.choices)]
if len(formatted_doc.choices) <= 1:
raise ValueError(
"You can't use a multi choice metric with only one choice. Use `acc_golds_likelihood` instead."
)

# Todo: make better system with return_bool_score instead of taking first element
choices_logprob = [results[i].result[0] for i in range(len(formatted_doc.choices))] # sum(
choices_logprob = [mc_results[i].result[0] for i in range(len(formatted_doc.choices))] # sum(
gold_ixs = as_list(formatted_doc.gold_index)

for metric in metrics:
if metric.category == MetricCategory.MULTICHOICE:
outputs.update(
metric.compute(choices_logprob=choices_logprob, gold_ixs=gold_ixs, formatted_doc=formatted_doc)
)

return results, outputs
return results[len(formatted_doc.choices) :], outputs


def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc: Doc, metrics: list[Metric]):
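
The key change in metrics/__init__.py is that `apply_multichoice_metric` now scores only the first `len(formatted_doc.choices)` model returns (`mc_results`) and hands the unused remainder back to the caller (`return results[len(formatted_doc.choices):], outputs`) instead of returning the full list. A hedged sketch of that consume-a-prefix pattern; the helper name and placeholder values are made up:

```python
def apply_multichoice_like(results, num_choices):
    if num_choices <= 1:
        raise ValueError("A multichoice metric needs at least two choices.")
    mc_results, rest = results[:num_choices], results[num_choices:]
    outputs = {"num_choice_logprobs": len(mc_results)}  # stand-in for the real metric computation
    return rest, outputs


# The caller keeps chaining: whatever is left over feeds the next metric applier.
leftover, outputs = apply_multichoice_like(["ll_A", "ll_B", "ll_C", "generation"], num_choices=3)
assert leftover == ["generation"]
```
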
src/lighteval/metrics/harness_compatibility/drop.py: 37 changes (26 additions, 11 deletions)
@@ -22,17 +22,30 @@

import re
import string
from typing import List, Set, Tuple

import numpy as np
from scipy.optimize import linear_sum_assignment


def drop_metrics(predictions: list[str], formatted_doc, **kwargs): # noqa: C901
"""F1 score from bag of words: comes from Harness Drop
"""F1 score from bag of words: comes from Harness Drop. DROP offers two metrics,
a quasi exact match and a numeracy-focused F1 score. Quasi in the sense that it
does some normalizations before matching and numeracy-focused in the sense that
if there's number mismatch between the target and prediction F1 score is set to 0.
F1 score is computed using the intersection of target and prediction's BoW
representations with the additional spice that if the answer and/or prediction is
comprised of multiple spans, a greedy matching is done between the two sets of spans
(based on the very BoW overlap) and the average over F1 of pairs is returned.
DROP also accepts multiple answers in which case, the maximum of F1/ Exact Match
between prediction and the different answers is taken.
For more information, please refer to the section 5 of the DROP paper (https://aclanthology.org/N19-1246/).
Todo: this code is really hard to follow, simplify when possible
"""

def _answer_to_bags(answer):
def _answer_to_bags(answer: List[str]) -> Tuple[List[str], List[Set[str]]]:
if isinstance(answer, (list, tuple)):
raw_spans = answer
else:
@@ -45,23 +58,25 @@ def _answer_to_bags(answer):
token_bags.append(set(normalized_span.split()))
return normalized_spans, token_bags

def _get_metrics(predicted, gold):
def _get_metrics(predicted: List[str], gold: List[str]):
"""
Takes a predicted answer and a gold answer (that are both either a string or a list of
strings), and returns exact match and the DROP F1 metric for the prediction. If you are
writing a script for evaluating objects in memory (say, the output of predictions during
validation, or while training), this is the function you want to call, after using
:func:`answer_json_to_strings` when reading the gold answer from the released data file.
"""
predicted_bags = _answer_to_bags(predicted)
gold_bags = _answer_to_bags(gold)
pred_normalized_spans, pred_bags = _answer_to_bags(predicted)
gold_normalized_spans, gold_bags = _answer_to_bags(gold)

if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
if set(pred_normalized_spans) == set(gold_normalized_spans) and len(gold_normalized_spans) == len(
gold_normalized_spans
):
exact_match = 1.0
else:
exact_match = 0.0

f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1])
f1_per_bag = _align_bags(pred_bags, gold_bags)
f1 = np.mean(f1_per_bag)
f1 = round(f1, 2)
return exact_match, f1
@@ -73,7 +88,7 @@ def _is_number(text):
except ValueError:
return False

def _match_numbers_if_present(gold_bag, predicted_bag):
def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]):
gold_numbers = set()
predicted_numbers = set()
for word in gold_bag:
@@ -86,7 +101,7 @@ def _match_numbers_if_present(gold_bag, predicted_bag):
return True
return False

def _align_bags(predicted, gold):
def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> np.array:
"""
Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
between them and gets maximum metric values over all the answers.
@@ -136,7 +151,7 @@ def _fix_number(text):
def _tokenize(text):
return re.split(" |-", text)

def _normalize(answer):
def _normalize(answer: str):
tokens = [
_white_space_fix(_remove_articles(_fix_number(_remove_punc(token.lower())))) for token in _tokenize(answer)
]
@@ -147,9 +162,9 @@ def _normalize(answer):
max_em = 0
max_f1 = 0
for gold_answer in formatted_doc.specific["golds_no_preprocessing"]:
exact_match, f1_score = _get_metrics(predictions, gold_answer)
if isinstance(gold_answer, list):
gold_answer = gold_answer[0]
exact_match, f1_score = _get_metrics(predictions, gold_answer)
if gold_answer.strip():
max_em = max(max_em, exact_match)
max_f1 = max(max_f1, f1_score)
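
The expanded drop.py docstring describes the DROP F1: answers and predictions may consist of several spans, each span is normalized into a bag of words, and predicted bags are aligned with gold bags before the per-pair F1 scores are averaged (the module imports `scipy.optimize.linear_sum_assignment` for that alignment). A simplified, self-contained sketch of the alignment idea, leaving out the normalization and the number-matching rule mentioned in the docstring:

```python
import numpy as np
from scipy.optimize import linear_sum_assignment


def bow_f1(pred: set, gold: set) -> float:
    overlap = len(pred & gold)
    if overlap == 0:
        return 0.0
    precision, recall = overlap / len(pred), overlap / len(gold)
    return 2 * precision * recall / (precision + recall)


def align_bags(predicted: list, gold: list) -> float:
    # Score every (predicted span, gold span) pair, then keep the 1-1 assignment that
    # maximizes total F1; the real metric additionally zeroes a pair's score when the
    # numbers found in the two bags disagree.
    scores = np.array([[bow_f1(p, g) for g in gold] for p in predicted])
    rows, cols = linear_sum_assignment(-scores)  # minimizing -F1 maximizes F1
    per_span = np.zeros(max(len(predicted), len(gold)))
    per_span[: len(rows)] = scores[rows, cols]
    return float(np.mean(per_span))


print(align_bags([{"10", "yards"}], [{"10"}, {"touchdown", "pass"}]))  # ~0.33
```
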
src/lighteval/models/base_model.py: 8 changes (2 additions, 6 deletions)
@@ -29,7 +29,7 @@
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset, LoglikelihoodSingleTokenDataset
from lighteval.logging.hierarchical_logger import hlog, hlog_err, hlog_warn
@@ -88,9 +88,7 @@ def __init__(
self.multichoice_continuations_start_space = config.multichoice_continuations_start_space

# We are in DP (and launch the script with `accelerate launch`)
if not config.model_parallel and config.quantization_config is None:
# might need to use accelerate instead
# self.model = config.accelerator.prepare(self.model)
if not config.model_parallel and not isinstance(config.quantization_config, BitsAndBytesConfig):
hlog(f"Using Data Parallelism, putting model on device {self._device}")
self.model = self.model.to(self._device)

@@ -267,8 +265,6 @@ def _init_max_length(self, max_length) -> int:
if hasattr(self._config, attr):
return getattr(self._config, attr)

if hasattr(self.tokenizer, "model_max_length"):
return self.tokenizer.model_max_length
# Default max sequence length setting for when no `max_length` is provided
# or no max length config setting is found in the model or tokenizer.
return 2048
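
Two things change in base_model.py: the manual `.to(device)` placement now skips only bitsandbytes-quantized models (which `accelerate` has already dispatched and which cannot be moved with `.to()`) instead of every quantized model, and `_init_max_length` no longer falls back to `tokenizer.model_max_length`, so 2048 is used whenever neither the caller nor the model config provides a maximum length. A minimal sketch of the new placement guard; `maybe_move_to_device` and the `getattr` defaults are illustrative, not lighteval's API:

```python
from transformers import BitsAndBytesConfig


def maybe_move_to_device(model, config, device):
    # Only the plain data-parallel case gets moved by hand; other quantization
    # schemes (e.g. GPTQ) are no longer excluded, only bitsandbytes is.
    bnb_quantized = isinstance(getattr(config, "quantization_config", None), BitsAndBytesConfig)
    if not getattr(config, "model_parallel", False) and not bnb_quantized:
        model = model.to(device)
    return model
```
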
src/lighteval/models/model_config.py: 33 changes (25 additions, 8 deletions)
@@ -95,7 +95,8 @@ class BaseModelConfig:
Use `dtype="auto"` to derive the type from the model's weights.
device (Union[int, str]): device to use for model training.
quantization_config (Optional[BitsAndBytesConfig]): quantization
configuration for the model. Needed for 4-bit and 8-bit precision.
configuration for the model, manually provided to load a normally floating point
model at a quantized precision. Needed for 4-bit and 8-bit precision.
trust_remote_code (bool): Whether to trust remote code during model
loading.
@@ -144,13 +145,29 @@ def _init_configs(self, model_name: str, env_config: EnvConfig) -> PretrainedCon
cache_dir=env_config.cache_dir,
token=env_config.token,
)
if getattr(auto_config, "quantization_config", False) and self.quantization_config is None:
if not is_autogptq_available():
raise ImportError(NO_AUTOGPTQ_ERROR_MSG)
hlog(
"`quantization_config` is None but was found in the model's config, using the one found in config.json"
)
self.quantization_config = GPTQConfig(**auto_config.quantization_config, disable_exllama=True)

# Gathering the model's automatic quantization config, if available
try:
model_auto_quantization_config = auto_config.quantization_config
hlog("An automatic quantization config was found in the model's config. Using it to load the model")
except (AttributeError, KeyError):
model_auto_quantization_config = None

if model_auto_quantization_config is not None:
if self.quantization_config is not None:
# We don't load models quantized by default with a different user provided conf
raise ValueError("You manually requested quantization on a model already quantized!")

# We add the quantization to the model params we store
if model_auto_quantization_config["quant_method"] == "gptq":
if not is_autogptq_available():
raise ImportError(NO_AUTOGPTQ_ERROR_MSG)
auto_config.quantization_config["use_exllama"] = None
self.quantization_config = GPTQConfig(**auto_config.quantization_config, disable_exllama=True)
elif model_auto_quantization_config["quant_method"] == "bitsandbytes":
if not is_bnb_available():
raise ImportError(NO_BNB_ERROR_MSG)
self.quantization_config = BitsAndBytesConfig(**auto_config.quantization_config)

return auto_config

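
model_config.py now reads the quantization config stored in the checkpoint itself (when there is one), refuses a manually supplied quantization config for a model that is already quantized, and rebuilds the matching config class from the stored `quant_method`. A condensed sketch of that dispatch; the helper name is made up, while the constructor calls mirror the ones visible in the diff:

```python
from transformers import BitsAndBytesConfig, GPTQConfig


def resolve_quantization_config(auto_config, user_quantization_config):
    stored = getattr(auto_config, "quantization_config", None)  # dict from config.json, if any
    if stored is None:
        return user_quantization_config                   # nothing baked into the checkpoint
    if user_quantization_config is not None:
        raise ValueError("You manually requested quantization on a model already quantized!")
    if stored["quant_method"] == "gptq":
        return GPTQConfig(**stored, disable_exllama=True)
    if stored["quant_method"] == "bitsandbytes":
        return BitsAndBytesConfig(**stored)
    raise ValueError(f"Unsupported quant_method: {stored['quant_method']}")
```
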
src/lighteval/tasks/lighteval_task.py: 22 changes (11 additions, 11 deletions)
@@ -567,6 +567,16 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic
results=results, formatted_doc=formatted_doc, metrics=self.metrics
)
outputs.update(cur_outputs)
if self.has_metric_category[MetricCategory.MULTICHOICE]:
results, cur_outputs = apply_multichoice_metric(
results=results, formatted_doc=formatted_doc, metrics=self.metrics
)
outputs.update(cur_outputs)
if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]:
results, cur_outputs = apply_multichoice_metric_one_token(
results=results, formatted_doc=formatted_doc, metrics=self.metrics
)
outputs.update(cur_outputs)
if self.has_metric_category[MetricCategory.PERPLEXITY]:
results, cur_outputs = apply_perplexity_metric(
results=results, formatted_doc=formatted_doc, metrics=self.metrics
@@ -585,16 +595,6 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic
max_num_samples=max(self.num_samples),
)
outputs.update(cur_outputs)
if self.has_metric_category[MetricCategory.MULTICHOICE]:
results, cur_outputs = apply_multichoice_metric(
results=results, formatted_doc=formatted_doc, metrics=self.metrics
)
outputs.update(cur_outputs)
if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]:
results, cur_outputs = apply_multichoice_metric_one_token(
results=results, formatted_doc=formatted_doc, metrics=self.metrics
)
outputs.update(cur_outputs)
if (
self.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN]
or self.has_metric_category[MetricCategory.LLM_AS_JUDGE]
Expand Down Expand Up @@ -671,7 +671,7 @@ def create_requests_from_tasks( # noqa: C901
) -> Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]:
"""
Takes a task dict and a fewshot dict and returns a dict of requests, a dict
of docs, and a dict of requests origins. The construction of prompts and
of docs, and a dict of requests origins. The construction of prompts and
thus the managing of few shots is done here.
Args:
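
The lighteval_task.py hunks only move the MULTICHOICE and MULTICHOICE_ONE_TOKEN blocks earlier in `process_results` (plus a minor docstring tweak in `create_requests_from_tasks`). This ordering interacts with the metrics change above: each applier now consumes its share of `results` and forwards the remainder, so the multichoice appliers should strip off the per-choice loglikelihoods before the later appliers inspect what is left. A toy sketch of that ordering; the function bodies are placeholders, not the real metric code:

```python
def apply_multichoice(results, num_choices):
    # Consumes the per-choice loglikelihood returns and passes the rest along.
    return results[num_choices:], {"mc_scored": num_choices}


def apply_generative(results):
    return results[1:], {"gen_scored": 1}


results = ["ll_choice_0", "ll_choice_1", "ll_choice_2", "generation"]
results, mc_out = apply_multichoice(results, num_choices=3)   # must run first
results, gen_out = apply_generative(results)
assert results == []
```
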
