Merged
51 commits
89e7fda  init ifeval, now need to add loading custom metric system (clefourrier, Feb 20, 2024)
96aa81b  Merge branch 'main' into clem_customizable_metrics (clefourrier, Feb 23, 2024)
2fdceb8  custom metrics working! need to update the readme (clefourrier, Feb 23, 2024)
0e30b21  update doc (clefourrier, Feb 23, 2024)
1ba178f  fix eos token + eval script (clefourrier, Feb 23, 2024)
6233af7  init (Feb 28, 2024)
5cc9c2c  remove ifeval (Feb 28, 2024)
b9045e1  revert README (Feb 28, 2024)
ff79480  revert README (Feb 28, 2024)
a234bf6  better context management (Feb 28, 2024)
1357c10  working state (NathanHB, Mar 6, 2024)
bb5cca2  fix (NathanHB, Mar 6, 2024)
6b74a68  Merge branch 'nathan_fix_push_details' into nathan-add-mt-bench (NathanHB, Mar 6, 2024)
f548902  continue (NathanHB, Mar 9, 2024)
2e2b15d  continue (NathanHB, Mar 11, 2024)
339f1f6  commit (NathanHB, Mar 20, 2024)
aba90b3  Merge remote-tracking branch 'origin/main' into nathan-add-mt-bench (NathanHB, Mar 20, 2024)
5bc5b98  Update README.md (NathanHB, Mar 20, 2024)
cd1300d  commit (NathanHB, Mar 20, 2024)
1fd755e  commit (NathanHB, Mar 20, 2024)
4b00eb7  commit (NathanHB, Mar 20, 2024)
4903755  commit (NathanHB, Mar 20, 2024)
9ff0707  commit (NathanHB, Mar 20, 2024)
ff177a1  commit (NathanHB, Mar 20, 2024)
9794b7c  commit (NathanHB, Mar 20, 2024)
6268ff6  commit (NathanHB, Mar 20, 2024)
31eaab1  commit (NathanHB, Mar 20, 2024)
c80ef8c  commit (NathanHB, Mar 21, 2024)
c296b63  Revert "commit" (NathanHB, Mar 21, 2024)
804f41a  commit (NathanHB, Mar 21, 2024)
48b0fee  remove model adapter (NathanHB, Mar 21, 2024)
e5b6ea8  commit (NathanHB, Mar 21, 2024)
0dcdb1e  update readme (NathanHB, Mar 21, 2024)
703741b  commti (NathanHB, Mar 21, 2024)
6e8026f  commit (NathanHB, Mar 22, 2024)
588fb2f  format (NathanHB, Mar 22, 2024)
8cb4894  format (NathanHB, Mar 22, 2024)
c08a8f6  commit (NathanHB, Mar 25, 2024)
64ceee5  fixes for review (NathanHB, Mar 27, 2024)
46d7dd8  make style (NathanHB, Mar 27, 2024)
e2f7fa8  fix (NathanHB, Mar 27, 2024)
3260147  revert generate_response in base model (NathanHB, Mar 27, 2024)
323188a  Merge remote-tracking branch 'origin/main' into nathan-add-mt-bench (NathanHB, Mar 27, 2024)
33eb252  merge (NathanHB, Mar 27, 2024)
b2e5895  fix tests (NathanHB, Mar 27, 2024)
c42e65d  fix format (NathanHB, Mar 27, 2024)
aa6c6f8  commit (NathanHB, Mar 29, 2024)
bb4b133  make style (NathanHB, Mar 29, 2024)
2d3a04c  fix from review (NathanHB, Mar 29, 2024)
0819ac7  fix (NathanHB, Mar 29, 2024)
b2bf514  Merge branch 'main' into nathan-add-mt-bench (NathanHB, Mar 29, 2024)
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -88,7 +88,8 @@ quality = ["ruff==v0.2.2","pre-commit"]
tests = ["pytest==7.4.0"]
dev = ["lighteval[accelerate,quality,tests]"]
extended_tasks = [
"langdetect", #ifeval
"langdetect", # ifeval
"openai", # mt-bench
]

[project.urls]
18 changes: 17 additions & 1 deletion src/lighteval/evaluator.py
@@ -88,6 +88,8 @@ def evaluate( # noqa: C901
full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs)
elif request_type == RequestType.LOGLIKELIHOOD_ROLLING:
full_resps = lm.loglikelihood_rolling(requests, override_bs=override_bs)
elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN:
full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs)
else:
raise NotImplementedError(f"Request type {request_type} not supported")

@@ -115,8 +117,22 @@ def evaluate( # noqa: C901
# using a deep copy here because process results pops from the model responses
metrics = task.process_results(doc, copy.deepcopy(model_responses))

# Remove the user_prompt from the metrics in case of llm-as-judge metric
if "user_prompt" in metrics:
user_prompt = metrics["user_prompt"]
del metrics["user_prompt"]
else:
user_prompt = None
if "judgement" in metrics:
judgement = metrics["judgement"]
del metrics["judgement"]
else:
judgement = None

evaluation_tracker.metrics_logger.log(task_example_id.task_name, metrics)
evaluation_tracker.details_logger.log(task_example_id.task_name, task, doc, model_responses, metrics)
evaluation_tracker.details_logger.log(
task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement)
)

return evaluation_tracker

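Note on the popping logic above: the two `if`/`del` blocks behave like `dict.pop` with a default. A minimal sketch of the expected flow, assuming a judge-style metric whose `process_results` returns `user_prompt` and `judgement` alongside its score (all key values below are made up for illustration):

```python
# Hypothetical process_results output for an llm-as-judge metric.
metrics = {
    "judge_score": 8.0,
    "user_prompt": "You are a judge. Rate the following answer ...",
    "judgement": "The answer is helpful and accurate. Rating: [[8]]",
}

# Equivalent to the if/del blocks in the hunk above.
user_prompt = metrics.pop("user_prompt", None)
judgement = metrics.pop("judgement", None)

# Only the scores reach metrics_logger.log ...
assert metrics == {"judge_score": 8.0}
# ... while the judge prompt and verdict are forwarded to details_logger.log.
details_extra = (user_prompt, judgement)
```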
42 changes: 41 additions & 1 deletion src/lighteval/few_shot_manager.py
@@ -27,7 +27,7 @@
from itertools import cycle
from typing import TYPE_CHECKING, Optional

from transformers import AutoTokenizer
from transformers import AutoTokenizer, PreTrainedTokenizer

from lighteval.logging.hierarchical_logger import hlog_warn
from lighteval.tasks.requests import Doc
@@ -219,6 +219,46 @@ def get_examples(
)
return instruction + labeled_examples + example

def create_multi_turn_contexts(
self, doc: Doc, use_chat_template: bool, system_prompt: Optional[str], tokenizer: PreTrainedTokenizer
) -> list[str]:
"""Creates N contexts (depending on the number of turn) for a tasks.
Multi turn tasks need use chat templating.

Args:
doc (Doc): Formatted document.
use_chat_template (bool): Whether or not to use the chat template. Will fail if False.
system_prompt (Optional[str]): The system prompt to use
tokenizer (PreTrainedTokenizer): The tokenizer used for the chat template

Raises:
ValueError: If use_chat_template is set to false.

Returns:
list[str]: contexts for every turn
"""
if not use_chat_template:
raise ValueError("You need to use the chat template to create multi turn contexts")

role_content_list = []
if system_prompt is not None:
role_content_list.append({"role": "system", "content": system_prompt})

for i in doc.specific["multi_turn_queries"]:
role_content_list.append({"role": "user", "content": i})
role_content_list.append({"role": "assistant", "content": "{model_response}"})
role_content_list.pop(-1)

contexts = []
offset = 2 if system_prompt is not None else 1
for i in range(0, len(role_content_list), offset + 1):
c = tokenizer.apply_chat_template(
role_content_list[: i + offset], add_generation_prompt=True, tokenize=False, add_special_tokens=False
)
contexts.append(c)

return contexts, 0

def fewshot_context(
self,
task: "LightevalTask",
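To make the turn-splitting in `create_multi_turn_contexts` concrete, here is a small usage sketch of the same chat-template pattern for a two-turn document, written outside lighteval; it assumes a tokenizer that ships a chat template, and the model name, queries, and `{model_response}` placeholder handling are illustrative only:

```python
# Minimal sketch: build one context per turn with a chat template.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

queries = ["Write a short poem about the sea.", "Now turn it into a haiku."]
system_prompt = None  # if set, it would be prepended to every turn

messages = []
if system_prompt is not None:
    messages.append({"role": "system", "content": system_prompt})

# Turn 1: only the first user message, with the generation prompt appended.
turn_1_context = tokenizer.apply_chat_template(
    messages + [{"role": "user", "content": queries[0]}],
    add_generation_prompt=True,
    tokenize=False,
)

# Turn 2: the first exchange (assistant slot kept as a placeholder that the
# model's first answer will later replace) plus the second user message.
turn_2_context = tokenizer.apply_chat_template(
    messages
    + [
        {"role": "user", "content": queries[0]},
        {"role": "assistant", "content": "{model_response}"},
        {"role": "user", "content": queries[1]},
    ],
    add_generation_prompt=True,
    tokenize=False,
)

print(turn_1_context)
print(turn_2_context)
```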
24 changes: 21 additions & 3 deletions src/lighteval/logging/info_loggers.py
@@ -24,7 +24,7 @@
import os
import time
from dataclasses import asdict, dataclass, field
from typing import Union
from typing import Optional, Union

import git
import numpy as np
@@ -205,6 +205,9 @@ class Detail:
choices: list = field(default_factory=list)
gold_index: list = field(default_factory=list)
metrics: dict = field(default_factory=dict)
judement_prompt: str = None
judgement: str = None
specifics: dict = field(default_factory=dict)

@dataclass
class CompiledDetail:
@@ -302,7 +305,15 @@ class CompiledHash:
compiled_details: dict[str, CompiledDetail] = collections.defaultdict(CompiledDetail)
compiled_details_over_all_tasks: CompiledDetailOverAllTasks = CompiledDetailOverAllTasks()

def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[ModelReturn], metrics: dict) -> None:
def log(
self,
task_name: str,
task: LightevalTask,
doc: Doc,
outputs: list[ModelReturn],
metrics: dict,
llm_as_prompt_judgement: Optional[tuple[str, str]] = None,
) -> None:
"""Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger.

Args:
@@ -311,6 +322,8 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model
doc (Doc): Current sample that we want to store.
outputs (list[ModelReturn]): Model outputs for the current sample
metrics (_type_): Model scores for said sample on the current task's metrics.
llm_as_prompt_judgement (tuple[str, str]): Tuple containing the
prompt passed to the judge and the judgement for the current sample when using llm-as-judge metric.
"""
detail = self.Detail()
detail.example = doc.query
@@ -354,6 +367,11 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model
detail.choices = doc.choices
detail.gold_index = as_list(doc.gold_index)
pred_saved = True
if task.has_metric_category[MetricCategory.GENERATIVE_MULTI_TURN]:
pred_saved = True
detail.judement_prompt = llm_as_prompt_judgement[0]
detail.judgement = llm_as_prompt_judgement[1]
detail.specifics = doc.specific
if not pred_saved:
raise NotImplementedError(
"No metric prediction saved."
@@ -364,7 +382,7 @@ def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[Model

hash = self.Hash()
hash.example = xxhash.xxh64(doc.query).hexdigest()
hash.full_prompt = xxhash.xxh64(doc.ctx).hexdigest()
hash.full_prompt = xxhash.xxh64(str(doc.ctx)).hexdigest()
hash.input_tokens = xxhash.xxh64(str([o.input_tokens for o in outputs])).hexdigest()
hash.cont_tokens = xxhash.xxh64(str([o.generated_tokens for o in outputs])).hexdigest()
self.hashes[task_name].append(hash)
11 changes: 11 additions & 0 deletions src/lighteval/metrics/__init__.py
@@ -146,3 +146,14 @@ def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc
)

return results, outputs


def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]):
outputs = {}
predictions = results.pop(0).result

for metric in metrics:
if Metrics[metric].value.category == MetricCategory.GENERATIVE_MULTI_TURN:
outputs.update(Metrics[metric].value.compute(predictions=predictions, formatted_doc=formatted_doc))

return results, outputs
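
For reference, a metric dispatched through `apply_generative_multi_turn_metric` receives the per-turn predictions popped above plus the formatted doc. A hedged sketch of what such a metric's `compute` could look like; the scoring logic and the `judge_score` key are illustrative and not the actual MT-Bench judge, while `user_prompt`/`judgement` match the keys consumed in `evaluator.py`:

```python
# Illustrative multi-turn, llm-as-judge style compute function: the judging
# step is stubbed out so the input/output contract stays visible.
def compute(predictions: list[str], formatted_doc, **kwargs) -> dict:
    # `predictions` holds one model answer per turn, in order.
    questions = formatted_doc.specific["multi_turn_queries"]

    # Prompt that would be sent to the judge model (stub: plain concatenation).
    user_prompt = "\n\n".join(
        f"Q{i + 1}: {q}\nA{i + 1}: {a}"
        for i, (q, a) in enumerate(zip(questions, predictions))
    )
    judgement = "Stubbed judge answer. Rating: [[8]]"  # would come from the judge model
    score = 8.0                                        # would be parsed from `judgement`

    # `user_prompt` and `judgement` are popped in evaluator.py and logged as
    # details; the remaining keys are treated as metric scores.
    return {"judge_score": score, "user_prompt": user_prompt, "judgement": judgement}
```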
1 change: 1 addition & 0 deletions src/lighteval/metrics/utils.py
@@ -28,6 +28,7 @@ class MetricCategory(Enum):
TARGET_PERPLEXITY = auto()
PERPLEXITY = auto()
GENERATIVE = auto()
GENERATIVE_MULTI_TURN = auto()
GENERATIVE_LOGPROB = auto()
MULTICHOICE = auto()
MULTICHOICE_ONE_TOKEN = auto()
14 changes: 13 additions & 1 deletion src/lighteval/models/abstract_model.py
@@ -27,8 +27,14 @@
from transformers import BatchEncoding

from lighteval.models.model_config import EnvConfig
from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn
from lighteval.models.model_output import (
GenerateMultiTurnReturn,
GenerateReturn,
LoglikelihoodReturn,
LoglikelihoodSingleTokenReturn,
)
from lighteval.tasks.requests import (
GreedyUntilMultiTurnRequest,
GreedyUntilRequest,
GreedyUntilWithLogitsRequest,
LoglikelihoodRequest,
@@ -102,6 +108,12 @@ def greedy_until_with_logits(
returns_logits=True,
)

def greedy_until_multi_turn( # noqa: C901
self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None
) -> GenerateMultiTurnReturn:
"""Generates responses using a greedy decoding strategy until certain ending conditions are met."""
return NotImplemented

@abstractmethod
def greedy_until(
self,