Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mt-bench #75

Merged
merged 51 commits into from
Mar 29, 2024
Merged
Changes from 1 commit
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
89e7fda
init ifeval, now need to add loading custom metric system
clefourrier Feb 20, 2024
96aa81b
Merge branch 'main' into clem_customizable_metrics
clefourrier Feb 23, 2024
2fdceb8
custom metrics working! need to update the readme
clefourrier Feb 23, 2024
0e30b21
update doc
clefourrier Feb 23, 2024
1ba178f
fix eos token + eval script
clefourrier Feb 23, 2024
6233af7
init
Feb 28, 2024
5cc9c2c
remove ifeval
Feb 28, 2024
b9045e1
revert README
Feb 28, 2024
ff79480
revert README
Feb 28, 2024
a234bf6
better context management
Feb 28, 2024
1357c10
working state
NathanHB Mar 6, 2024
bb5cca2
fix
NathanHB Mar 6, 2024
6b74a68
:Merge branch 'nathan_fix_push_details' into nathan-add-mt-bench
NathanHB Mar 6, 2024
f548902
continue
NathanHB Mar 9, 2024
2e2b15d
continue
NathanHB Mar 11, 2024
339f1f6
commit
NathanHB Mar 20, 2024
aba90b3
:Merge remote-tracking branch 'origin/main' into nathan-add-mt-bench
NathanHB Mar 20, 2024
5bc5b98
Update README.md
NathanHB Mar 20, 2024
cd1300d
commit
NathanHB Mar 20, 2024
1fd755e
commit
NathanHB Mar 20, 2024
4b00eb7
commit
NathanHB Mar 20, 2024
4903755
commit
NathanHB Mar 20, 2024
9ff0707
commit
NathanHB Mar 20, 2024
ff177a1
commit
NathanHB Mar 20, 2024
9794b7c
commit
NathanHB Mar 20, 2024
6268ff6
commit
NathanHB Mar 20, 2024
31eaab1
commit
NathanHB Mar 20, 2024
c80ef8c
commit
NathanHB Mar 21, 2024
c296b63
Revert "commit"
NathanHB Mar 21, 2024
804f41a
commit
NathanHB Mar 21, 2024
48b0fee
remove model adapter
NathanHB Mar 21, 2024
e5b6ea8
commit
NathanHB Mar 21, 2024
0dcdb1e
update readme
NathanHB Mar 21, 2024
703741b
commti
NathanHB Mar 21, 2024
6e8026f
commit
NathanHB Mar 22, 2024
588fb2f
format
NathanHB Mar 22, 2024
8cb4894
format
NathanHB Mar 22, 2024
c08a8f6
commit
NathanHB Mar 25, 2024
64ceee5
fixes for review
NathanHB Mar 27, 2024
46d7dd8
make style
NathanHB Mar 27, 2024
e2f7fa8
fix
NathanHB Mar 27, 2024
3260147
revert generate_response in base model
NathanHB Mar 27, 2024
323188a
Merge remote-tracking branch 'origin/main' into nathan-add-mt-bench
NathanHB Mar 27, 2024
33eb252
merge
NathanHB Mar 27, 2024
b2e5895
fix tests
NathanHB Mar 27, 2024
c42e65d
fix format
NathanHB Mar 27, 2024
aa6c6f8
commit
NathanHB Mar 29, 2024
bb4b133
make style
NathanHB Mar 29, 2024
2d3a04c
fix from review
NathanHB Mar 29, 2024
0819ac7
fix
NathanHB Mar 29, 2024
b2bf514
Merge branch 'main' into nathan-add-mt-bench
NathanHB Mar 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
format
  • Loading branch information
NathanHB committed Mar 22, 2024
commit 588fb2f9bb2640fa775786109e3984a64d20721f
23 changes: 23 additions & 0 deletions extended_tasks/mt_bench/judges.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,26 @@
# MIT License
NathanHB marked this conversation as resolved.
Show resolved Hide resolved

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import ast
import json
import re
23 changes: 23 additions & 0 deletions extended_tasks/mt_bench/main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,26 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team
NathanHB marked this conversation as resolved.
Show resolved Hide resolved

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# ruff: noqa: F405, F403, F401
"""
Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task.
4 changes: 3 additions & 1 deletion src/lighteval/evaluator.py
Original file line number Diff line number Diff line change
@@ -130,7 +130,9 @@ def evaluate( # noqa: C901
judgement = None

evaluation_tracker.metrics_logger.log(task_example_id.task_name, metrics)
evaluation_tracker.details_logger.log(task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement))
evaluation_tracker.details_logger.log(
task_example_id.task_name, task, doc, model_responses, metrics, (user_prompt, judgement)
)

return evaluation_tracker

10 changes: 9 additions & 1 deletion src/lighteval/logging/info_loggers.py
Original file line number Diff line number Diff line change
@@ -305,7 +305,15 @@ class CompiledHash:
compiled_details: dict[str, CompiledDetail] = collections.defaultdict(CompiledDetail)
compiled_details_over_all_tasks: CompiledDetailOverAllTasks = CompiledDetailOverAllTasks()

def log(self, task_name: str, task: LightevalTask, doc: Doc, outputs: list[ModelReturn], metrics: dict, llm_as_prompt_judgement: tuple[str, str]) -> None:
def log(
self,
task_name: str,
task: LightevalTask,
doc: Doc,
outputs: list[ModelReturn],
metrics: dict,
llm_as_prompt_judgement: tuple[str, str],
NathanHB marked this conversation as resolved.
Show resolved Hide resolved
NathanHB marked this conversation as resolved.
Show resolved Hide resolved
) -> None:
"""Stores the relevant information for one sample of one task to the total list of samples stored in the DetailsLogger.

Args:
1 change: 1 addition & 0 deletions src/lighteval/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -147,6 +147,7 @@ def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc

return results, outputs


def apply_generative_multi_turn_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]):
outputs = {}
predictions = results.pop(0).result
14 changes: 12 additions & 2 deletions src/lighteval/models/base_model.py
Original file line number Diff line number Diff line change
@@ -352,7 +352,9 @@ def greedy_until_with_logits(
override_bs=override_bs,
)

def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None) -> GenerateMultiTurnReturn:
def greedy_until_multi_turn(
NathanHB marked this conversation as resolved.
Show resolved Hide resolved
self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None
) -> GenerateMultiTurnReturn:
for request in requests:
request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token]
request.tokenized_context = self.tok_encode(request.context)
@@ -429,7 +431,15 @@ def greedy_until_multi_turn(self, requests: list[GreedyUntilMultiTurnRequest], o

model_answers.append(cur_reponses[0].result)

results.append(GenerateMultiTurnReturn(result=model_answers, input_tokens=[], generated_tokens=[], truncated_tokens_count=0, padded_tokens_count=0))
results.append(
GenerateMultiTurnReturn(
result=model_answers,
input_tokens=[],
NathanHB marked this conversation as resolved.
Show resolved Hide resolved
generated_tokens=[],
truncated_tokens_count=0,
padded_tokens_count=0,
)
)

return results

1 change: 1 addition & 0 deletions src/lighteval/models/model_output.py
Original file line number Diff line number Diff line change
@@ -65,6 +65,7 @@ class GenerateReturn(ModelReturn):
def get_result_for_eval(self):
return self.result if self.logits is None else (self.result, self.logits)


@dataclass
class GenerateMultiTurnReturn(ModelReturn):
result: list[str] = field(default_factory=list)
2 changes: 2 additions & 0 deletions src/lighteval/tasks/requests.py
Original file line number Diff line number Diff line change
@@ -120,6 +120,7 @@ class GreedyUntilRequest(Request):
request_type = RequestType.GREEDY_UNTIL
tokenized_context: list[int] = None


@dataclass
class GreedyUntilMultiTurnRequest(Request):
"""
@@ -130,6 +131,7 @@ class GreedyUntilMultiTurnRequest(Request):
generation_size (int): The maximum number of tokens to generate.
request_type (RequestType): The type of the request, set to RequestType.GREEDY_UNTIL_MULTI_TURN.
"""

stop_sequence: str
generation_size: int
request_type = RequestType.GREEDY_UNTIL_MULTI_TURN
Loading