diff --git a/docs/sections/pipeline_samples/examples/index.md b/docs/sections/pipeline_samples/examples/index.md
index 19b2136278..ffcadb3199 100644
--- a/docs/sections/pipeline_samples/examples/index.md
+++ b/docs/sections/pipeline_samples/examples/index.md
@@ -60,3 +60,19 @@ Answer instructions with knowledge graphs defined as `pydantic.BaseModel` object
     ```
 
 ![Knowledge graph figure](../../../assets/images/sections/examples/knowledge-graph-example.png)
+
+
+### [Benchmarking with `distilabel`: Arena Hard](#benchmarking-with-distilabel-arena-hard)
+
+Benchmark LLMs with `distilabel`, reproducing the Arena Hard benchmark.
+
+??? Example "See example"
+
+    The script below first defines both the `ArenaHard` and the `ArenaHardResults` tasks: the former uses a judge LLM to compare a pair of responses to the same prompt, and the latter computes the final results following the original implementation. The second part of the example then builds a `Pipeline` that generates responses for a collection of prompts with `InferenceEndpointsLLM`, loads a pre-computed set of GPT-4 generations as the reference answers, and evaluates one against the other with `OpenAILLM`, which produces its own answer, a comparison between both responses, and a verdict formatted as A>>B, A>B, B>A, B>>A, or a tie.
+
+    To run this example you will first need to install `pandas`, `scikit-learn`, and `numpy`, since they are required to compute the Arena Hard results.
+
+    ```python title="arena_hard.py"
+    --8<-- "examples/arena_hard.py"
+    ```
+
diff --git a/src/distilabel/steps/tasks/benchmarks/arena_hard.py b/examples/arena_hard.py
similarity index 77%
rename from src/distilabel/steps/tasks/benchmarks/arena_hard.py
rename to examples/arena_hard.py
index 78cd7fa175..81bec55ace 100644
--- a/src/distilabel/steps/tasks/benchmarks/arena_hard.py
+++ b/examples/arena_hard.py
@@ -15,12 +15,11 @@
 import re
 from typing import Any, Dict, List, Optional, Union
 
-from typing_extensions import override
-
 from distilabel.steps import GlobalStep, StepInput
 from distilabel.steps.tasks.base import Task
 from distilabel.steps.tasks.typing import ChatType
 from distilabel.steps.typing import StepOutput
+from typing_extensions import override
 
 
 class ArenaHard(Task):
@@ -326,3 +325,134 @@ def process(self, inputs: StepInput) -> StepOutput:  # type: ignore
         # Here only so that if follow up steps are connected the inputs are preserved,
         # since this step doesn't modify nor generate new inputs
         yield inputs
+
+
+if __name__ == "__main__":
+    import json
+
+    from distilabel.llms import InferenceEndpointsLLM, OpenAILLM
+    from distilabel.pipeline import Pipeline
+    from distilabel.steps import (
+        CombineColumns,
+        KeepColumns,
+        LoadHubDataset,
+        StepInput,
+        step,
+    )
+    from distilabel.steps.tasks import TextGeneration
+    from distilabel.steps.typing import StepOutput
+
+    @step(inputs=["turns"], outputs=["system_prompt", "instruction"])
+    def PrepareForTextGeneration(*inputs: StepInput) -> StepOutput:
+        for input in inputs:
+            for item in input:
+                item["system_prompt"] = "You are a helpful assistant."
+                item["instruction"] = item["turns"][0]["content"]
+            yield input
+
+    @step(
+        inputs=["question_id"],
+        outputs=["generation", "generation_model"],
+        step_type="global",
+    )
+    def LoadReference(*inputs: StepInput) -> StepOutput:
+        # File downloaded from https://raw.githubusercontent.com/lm-sys/arena-hard-auto/e0a8ea1df42c1df76451a6cd04b14e31ff992b87/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl
+        lines = open("gpt-4-0314.jsonl", mode="r").readlines()
+        for input in inputs:
+            for item in input:
+                for line in lines:
+                    data = json.loads(line)
+                    if data["question_id"] == item["question_id"]:
+                        item["generation"] = data["choices"][0]["turns"][0]["content"]
+                        item["generation_model"] = data["model_id"]
+                        break
+            yield input
+
+    with Pipeline(name="arena-hard-v0.1") as pipeline:
+        load_dataset = LoadHubDataset(
+            name="load_dataset",
+            repo_id="alvarobartt/lmsys-arena-hard-v0.1",
+            split="test",
+            num_examples=5,
+        )
+
+        load_reference = LoadReference(name="load_reference")
+
+        prepare = PrepareForTextGeneration(name="prepare")
+
+        text_generation_cohere = TextGeneration(
+            name="text_generation_cohere",
+            llm=InferenceEndpointsLLM(
+                model_id="CohereForAI/c4ai-command-r-plus",
+                tokenizer_id="CohereForAI/c4ai-command-r-plus",
+            ),
+            use_system_prompt=True,
+            input_batch_size=10,
+            output_mappings={"model_name": "generation_model"},
+        )
+
+        combine_columns = CombineColumns(
+            name="combine_columns",
+            columns=["generation", "generation_model"],
+            output_columns=["generations", "generation_models"],
+        )
+
+        arena_hard = ArenaHard(
+            name="arena_hard",
+            llm=OpenAILLM(model="gpt-4-1106-preview"),
+            output_mappings={"model_name": "evaluation_model"},
+        )
+
+        keep_columns = KeepColumns(
+            name="keep_columns",
+            columns=[
+                "question_id",
+                "category",
+                "cluster",
+                "system_prompt",
+                "instruction",
+                "generations",
+                "generation_models",
+                "evaluation",
+                "score",
+                "evaluation_model",
+            ],
+        )
+
+        win_rates = ArenaHardResults(
+            name="win_rates", custom_model_column="generation_models"
+        )
+
+        load_dataset >> load_reference  # type: ignore
+        load_dataset >> prepare >> text_generation_cohere  # type: ignore
+        (  # type: ignore
+            [load_reference, text_generation_cohere]
+            >> combine_columns
+            >> arena_hard
+            >> keep_columns
+            >> win_rates
+        )
+
+    distiset = pipeline.run(
+        parameters={  # type: ignore
+            text_generation_cohere.name: {
+                "llm": {
+                    "generation_kwargs": {
+                        "temperature": 0.7,
+                        "max_new_tokens": 4096,
+                        "stop_sequences": ["<EOS_TOKEN>", "<|END_OF_TURN_TOKEN|>"],
+                    }
+                }
+            },
+            arena_hard.name: {
+                "llm": {
+                    "generation_kwargs": {
+                        "temperature": 0.0,
+                        "max_new_tokens": 4096,
+                    }
+                }
+            },
+        },
+    )
+    if distiset is not None:
+        distiset.push_to_hub("arena-hard-results")
diff --git a/pyproject.toml b/pyproject.toml
index 50c858462c..b94387ff51 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,9 +87,6 @@ outlines = ["outlines >= 0.0.40"]
 vertexai = ["google-cloud-aiplatform >= 1.38.0"]
 vllm = ["vllm >= 0.4.0", "outlines == 0.0.34", "filelock >= 3.13.4"]
 
-# Other optional dependencies
-arena-hard = ["pandas", "numpy", "scikit-learn"]
-
 [project.urls]
 Documentation = "https://distilabel.argilla.io/"
 Issues = "https://github.com/argilla/distilabel/issues"
diff --git a/scripts/install_dependencies.sh b/scripts/install_dependencies.sh
index 3372bd60fd..4da6ad9dd4 100755
--- a/scripts/install_dependencies.sh
+++ b/scripts/install_dependencies.sh
@@ -6,7 +6,7 @@ python_version=$(python -c "import sys; print(sys.version_info[:2])")
 
 python -m pip install uv
 
-uv pip install --system -e ".[dev,tests,anthropic,arena-hard,argilla,cohere,groq,hf-inference-endpoints,hf-transformers,litellm,llama-cpp,ollama,openai,outlines,vertexai]"
+uv pip install --system -e ".[dev,tests,anthropic,argilla,cohere,groq,hf-inference-endpoints,hf-transformers,litellm,llama-cpp,ollama,openai,outlines,vertexai]"
 if [ "${python_version}" != "(3, 8)" ]; then
   uv pip install --system -e .[mistralai,instructor]
 fi
diff --git a/src/distilabel/steps/tasks/__init__.py b/src/distilabel/steps/tasks/__init__.py
index d1bccf08f2..b2456d7824 100644
--- a/src/distilabel/steps/tasks/__init__.py
+++ b/src/distilabel/steps/tasks/__init__.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 from distilabel.steps.tasks.base import GeneratorTask, Task
-from distilabel.steps.tasks.benchmarks.arena_hard import ArenaHard, ArenaHardResults
 from distilabel.steps.tasks.complexity_scorer import ComplexityScorer
 from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
 from distilabel.steps.tasks.evol_instruct.evol_complexity.base import EvolComplexity
@@ -47,8 +46,6 @@
 from distilabel.steps.tasks.ultrafeedback import UltraFeedback
 
 __all__ = [
-    "ArenaHard",
-    "ArenaHardResults",
     "GeneratorTask",
     "Task",
     "ComplexityScorer",
diff --git a/src/distilabel/steps/tasks/benchmarks/__init__.py b/src/distilabel/steps/tasks/benchmarks/__init__.py
deleted file mode 100644
index 2598794f29..0000000000
--- a/src/distilabel/steps/tasks/benchmarks/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/tests/unit/steps/tasks/benchmarks/__init__.py b/tests/unit/steps/tasks/benchmarks/__init__.py
deleted file mode 100644
index 2598794f29..0000000000
--- a/tests/unit/steps/tasks/benchmarks/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/tests/unit/steps/tasks/benchmarks/test_arena_hard.py b/tests/unit/steps/tasks/benchmarks/test_arena_hard.py
deleted file mode 100644
index 40666e4402..0000000000
--- a/tests/unit/steps/tasks/benchmarks/test_arena_hard.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from typing import Any, Dict, List, Union
-
-import pytest
-from _pytest.logging import LogCaptureFixture
-from distilabel.pipeline.local import Pipeline
-from distilabel.steps.tasks.benchmarks.arena_hard import ArenaHard, ArenaHardResults
-
-from tests.unit.conftest import DummyLLM
-
-
-class TestArenaHard:
-    def test_format_input(self) -> None:
-        task = ArenaHard(
-            name="arena_hard",
-            llm=DummyLLM(),
-            pipeline=Pipeline(name="unit-test-pipeline"),
-        )
-        task.load()
-
-        result = task.format_input(
-            input={
-                "instruction": "INSTRUCTION",
-                "generations": ["GENERATION_A", "GENERATION_B"],
-            }
-        )
-
-        assert result[-1] == {
-            "role": "user",
-            "content": "<|User Prompt|>\nINSTRUCTION\n\n<|The Start of Assistant A's Answer|>\nGENERATION_A\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\nGENERATION_B\n<|The End of Assistant B's Answer|>",
-        }
-
-    @pytest.mark.parametrize(
-        "output, expected",
-        [
-            (
-                "My own answer to the prompt would be:\nANSWER\nMy final veredict is: [[A>>B]]\n",
-                {
-                    "evaluation": "My own answer to the prompt would be:\nANSWER\nMy final veredict is: [[A>>B]]\n",
-                    "score": "A>>B",
-                },
-            ),
-            (
-                "My own answer to the prompt would be:\nANSWER\nMy final veredict is: TIE\n",
-                {
-                    "evaluation": "My own answer to the prompt would be:\nANSWER\nMy final veredict is: TIE\n",
-                    "score": None,
-                },
-            ),
-            (
-                None,
-                {"evaluation": None, "score": None},
-            ),
-        ],
-    )
-    def test_format_output(
-        self, output: Union[str, None], expected: Dict[str, Any]
-    ) -> None:
-        task = ArenaHard(
-            name="arena_hard",
-            llm=DummyLLM(),
-            pipeline=Pipeline(name="unit-test-pipeline"),
-        )
-        task.load()
-
-        assert (
-            task.format_output(
-                output=output,
-                input={
-                    "instruction": "INSTRUCTION",
-                    "generations": ["GENERATION_A", "GENERATION_B"],
-                },
-            )
-            == expected
-        )
-
-
-class TestArenaHardResults:
-    @pytest.mark.parametrize(
-        "custom_model_column, inputs",
-        [
-            ("model_name", ["evaluation", "score", "model_name"]),
-            (None, ["evaluation", "score"]),
-        ],
-    )
-    def test_inputs(
-        self, custom_model_column: Union[str, None], inputs: List[str]
-    ) -> None:
-        step = ArenaHardResults(
-            name="arena_hard_results",
-            custom_model_column=custom_model_column,
-            pipeline=Pipeline(name="unit-test-pipeline"),
-        )
-        assert step.inputs == inputs
-
-    def test_process(self, caplog: LogCaptureFixture) -> None:
-        step = ArenaHardResults(
-            name="arena_hard_results",
-            custom_model_column="model_names",
-            pipeline=Pipeline(name="unit-test-pipeline"),
-        )
-        step.load()
-
-        with caplog.at_level(logging.INFO):
-            next(
-                step.process(
-                    [
-                        {
-                            "evaluation": "...",
-                            "score": "A>>B",
-                            "model_names": ["gpt-4-0314", "other-model"],
-                        },
-                        {
-                            "evaluation": "...",
-                            "score": "A=B",
-                            "model_names": ["gpt-4-0314", "other-model"],
-                        },
-                        {
-                            "evaluation": "...",
-                            "score": "B>>A",
-                            "model_names": ["gpt-4-0314", "other-model"],
-                        },
-                    ]
-                )
-            )
-        assert (
-            "Arena Hard ELO: other-model 1445.577347\ngpt-4-0314 1000.000000\ndtype: float64\n"
-            in caplog.text
-        )
-
-    def test_process_errors(self) -> None:
-        step = ArenaHardResults(
-            name="arena_hard_results",
-            custom_model_column="model_names",
-            pipeline=Pipeline(name="unit-test-pipeline"),
-        )
-        step.load()
-
-        with pytest.raises(
-            ValueError,
-            match="This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0",
-        ):
-            next(
-                step.process(
-                    [
-                        {
-                            "evaluation": "...",
-                            "score": "A>>B",
-                            "model_names": ["gpt-4-0314", "other-model"],
-                        },
-                        {
-                            "evaluation": "...",
-                            "score": "B>>A",
-                            "model_names": ["gpt-4-0314", "other-model"],
-                        },
-                    ]
-                )
-            )
diff --git a/tests/unit/test_imports.py b/tests/unit/test_imports.py
index 94feb041ed..e20e186c8e 100644
--- a/tests/unit/test_imports.py
+++ b/tests/unit/test_imports.py
@@ -63,8 +63,6 @@ def test_imports() -> None:
     )
 
     from distilabel.steps.tasks import (
-        ArenaHard,
-        ArenaHardResults,
         Task,
        GeneratorTask,
        ChatItem,