Skip to content

Commit

Permalink
Add ArenaHard benchmark and ArenaHardResults step (#670)
Browse files Browse the repository at this point in the history
* Fix `ChatGeneration.format_input` exception

* Bump `datasets` to 2.16.0 or higher

To be able to efficiently use the cache via `load_dataset` whenever there's no connection

* Add `benchmarks/arena_hard.py` (WIP)

* Update `_get_hf_dataset_info` error message

* Add `ArenaHard` and `ArenaHardResults` docstrings

* Catch `ImportError` on `ArenaHardResults.load`

* Add `ArenaHard` and `ArenaHardEval` imports

* Add `arena-hard` extras

* Install `arena-hard` extra in `test.yml`

* Update `arena-hard` extra dependencies

* Fix circular import in `arena_hard.py`

* Apply suggestions from code review

* Add missing examples in docstrings & fix type-hints

* Add some future TODOs

* Update `install_dependencies.sh` to install `arena-hard` extra

* Add `ArenaHard` and `ArenaHardResults` unit tests
  • Loading branch information
alvarobartt authored Jun 12, 2024
1 parent 0e8c752 commit d32d664
Show file tree
Hide file tree
Showing 10 changed files with 532 additions and 4 deletions.
7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ classifiers = [
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"datasets >= 2.14.0",
# Bump `datasets` to support `load_dataset` from cache
# Ref https://github.com/huggingface/datasets/releases/tag/2.16.0
"datasets >= 2.16.0",
"httpx >= 0.25.2",
"importlib-resources >= 6.1.1; python_version < '3.9'",
"Jinja2 >= 3.1.2",
Expand Down Expand Up @@ -83,6 +85,9 @@ outlines = ["outlines >= 0.0.40"]
vertexai = ["google-cloud-aiplatform >= 1.38.0"]
vllm = ["vllm >= 0.4.0", "outlines == 0.0.34", "filelock >= 3.13.4"]

# Other optional dependencies
arena-hard = ["pandas", "numpy", "scikit-learn"]

[project.urls]
Documentation = "https://distilabel.argilla.io/"
Issues = "https://github.com/argilla/distilabel/issues"
Expand Down
2 changes: 1 addition & 1 deletion scripts/install_dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ python_version=$(python -c "import sys; print(sys.version_info[:2])")

python -m pip install uv

uv pip install --system -e ".[dev,tests,anthropic,argilla,cohere,groq,hf-inference-endpoints,hf-transformers,litellm,llama-cpp,ollama,openai,outlines,vertexai]"
uv pip install --system -e ".[dev,tests,anthropic,arena-hard,argilla,cohere,groq,hf-inference-endpoints,hf-transformers,litellm,llama-cpp,ollama,openai,outlines,vertexai]"
if [ "${python_version}" != "(3, 8)" ]; then
uv pip install --system -e .[mistralai,instructor]
fi
Expand Down
2 changes: 1 addition & 1 deletion src/distilabel/steps/generators/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def _dataset_info(self) -> Dict[str, DatasetInfo]:
class LoadHubDataset(LoadDataFromHub):
def __init__(self, **data: Any) -> None:
warnings.warn(
"`LoadHubDataset` is deprecated and will be removed in version 1.3.0, use `LoadFromHub` instead.",
"`LoadHubDataset` is deprecated and will be removed in version 1.3.0, use `LoadDataFromHub` instead.",
DeprecationWarning,
stacklevel=2,
)
Expand Down
3 changes: 3 additions & 0 deletions src/distilabel/steps/tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

from distilabel.steps.tasks.base import GeneratorTask, Task
from distilabel.steps.tasks.benchmarks.arena_hard import ArenaHard, ArenaHardResults
from distilabel.steps.tasks.complexity_scorer import ComplexityScorer
from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
from distilabel.steps.tasks.evol_instruct.evol_complexity.base import EvolComplexity
Expand Down Expand Up @@ -46,6 +47,8 @@
from distilabel.steps.tasks.ultrafeedback import UltraFeedback

__all__ = [
"ArenaHard",
"ArenaHardResults",
"GeneratorTask",
"Task",
"ComplexityScorer",
Expand Down
13 changes: 13 additions & 0 deletions src/distilabel/steps/tasks/benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
320 changes: 320 additions & 0 deletions src/distilabel/steps/tasks/benchmarks/arena_hard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,320 @@
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import Any, Dict, List, Optional, Union

from typing_extensions import override

from distilabel.steps import GlobalStep, StepInput
from distilabel.steps.tasks.base import Task
from distilabel.steps.tasks.typing import ChatType
from distilabel.steps.typing import StepOutput


class ArenaHard(Task):
"""This `Task` is based on the "From Live Data to High-Quality Benchmarks: The
Arena-Hard Pipeline" paper that presents Arena Hard, which is a benchmark for
instruction-tuned LLMs that contains 500 challenging user queries. GPT-4 is used
as the judge to compare the model responses against a baseline model, which defaults
to `gpt-4-0314`.
Note:
Arena-Hard-Auto has the highest correlation and separability to Chatbot Arena
among popular open-ended LLM benchmarks.
Input columns:
- instruction (`str`): The instruction to evaluate the responses.
- generations (`List[str]`): The responses generated by two, and only two, LLMs.
Output columns:
- evaluation (`str`): The evaluation of the responses generated by the LLMs.
- score (`str`): The score extracted from the evaluation.
- model_name (`str`): The model name used to generate the evaluation.
Categories:
- benchmark
References:
- [From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline](https://lmsys.org/blog/2024-04-19-arena-hard/)
- [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto/tree/main)
Examples:
Evaluate two assistant responses for a given instruction using Arean Hard prompts:
```python
from distilabel.pipeline import Pipeline
from distilabel.steps import CombineColumns, LoadDataFromDicts
from distilabel.steps.tasks import ArenaHard, TextGeneration
with Pipeline() as pipeline:
load_data = LoadDataFromDicts(
data=[{"instruction": "What is the capital of France?"}],
)
text_generation_a = TextGeneration(
llm=..., # LLM instance
output_mappings={"model_name": "generation_model"},
)
text_generation_b = TextGeneration(
llm=..., # LLM instance
output_mappings={"model_name": "generation_model"},
)
combine = CombineColumns(
columns=["generation", "generation_model"],
output_columns=["generations", "generation_models"],
)
arena_hard = ArenaHard(
llm=..., # LLM instance
)
load_data >> [text_generation_a, text_generation_b] >> combine >> arena_hard
```
"""

@property
def inputs(self) -> List[str]:
"""The inputs required by this task are the `instruction` and the `generations`,
which are the responses generated by two, and only two, LLMs."""
return ["instruction", "generations"]

def format_input(self, input: Dict[str, Any]) -> ChatType:
"""This method formats the input data as a `ChatType` using the prompt defined
by the Arena Hard benchmark, which consists on a `system_prompt` plus a template
for the user first message that contains the `instruction` and both `generations`.
"""
return [
{
"role": "system",
"content": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\".",
},
{
"role": "user",
"content": f"<|User Prompt|>\n{input['instruction']}\n\n<|The Start of Assistant A's Answer|>\n{input['generations'][0]}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{input['generations'][1]}\n<|The End of Assistant B's Answer|>",
},
]

@property
def outputs(self) -> List[str]:
"""The outputs generated by this task are the `evaluation`, the `score` and
the `model_name` (which is automatically injected within the `process` method
of the parent task)."""
return ["evaluation", "score", "model_name"]

def format_output(
self,
output: Union[str, None],
input: Union[Dict[str, Any], None] = None,
) -> Dict[str, Any]:
"""This method formats the output generated by the LLM as a Python dictionary
containing the `evaluation` which is the raw output generated by the LLM (consisting
of the judge LLM alternate generation for the given instruction, plus an explanation
on the evaluation of the given responses; plus the `score` extracted from the output.
Args:
output: the raw output of the LLM.
input: the input to the task. Is provided in case it needs to be used to enrich
the output if needed.
Returns:
A dict with the keys `evaluation` with the raw output which contains the LLM
evaluation and the extracted `score` if possible.
"""
if output is None:
return {"evaluation": None, "score": None}
pattern = re.compile(r"\[\[([AB<>=]+)\]\]")
match = pattern.search(output)
if match is None:
return {"evaluation": output, "score": None}
return {"evaluation": output, "score": match.group(1)}


class ArenaHardResults(GlobalStep):
"""This `Step` is based on the "From Live Data to High-Quality Benchmarks: The
Arena-Hard Pipeline" paper that presents Arena Hard, which is a benchmark for
instruction-tuned LLMs that contains 500 challenging user queries. This step is
a `GlobalStep` that should run right after the `ArenaHard` task to calculate the
ELO scores for the evaluated models.
Note:
Arena-Hard-Auto has the highest correlation and separability to Chatbot Arena
among popular open-ended LLM benchmarks.
References:
- [From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline](https://lmsys.org/blog/2024-04-19-arena-hard/)
- [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto/tree/main)
Examples:
Rate the ELO scores for two assistant responses for a given an evaluation / comparison between both using Arean Hard prompts:
```python
from distilabel.pipeline import Pipeline
from distilabel.steps import CombineColumns, LoadDataFromDicts
from distilabel.steps.tasks import ArenaHard, TextGeneration
with Pipeline() as pipeline:
load_data = LoadDataFromDicts(
data=[{"instruction": "What is the capital of France?"}],
)
text_generation_a = TextGeneration(
llm=..., # LLM instance
output_mappings={"model_name": "generation_model"},
)
text_generation_b = TextGeneration(
llm=..., # LLM instance
output_mappings={"model_name": "generation_model"},
)
combine = CombineColumns(
columns=["generation", "generation_model"],
output_columns=["generations", "generation_models"],
)
arena_hard = ArenaHard(
llm=..., # LLM instance
)
arena_hard_results = ArenaHardResults(
custom_model_column="generation_models",
custom_weights={"A>B": 1, "A>>B": 3, "B>A": 1, "B>>A": 3},
)
load_data >> [text_generation_a, text_generation_b] >> combine >> arena_hard >> arena_hard_results
```
"""

custom_model_column: Optional[str] = None
custom_weights: Dict[str, int] = {"A>B": 1, "A>>B": 3, "B>A": 1, "B>>A": 3}

def load(self) -> None:
"""Ensures that the required dependencies are installed."""
super().load()

try:
import numpy as np # noqa: F401
import pandas as pd # noqa: F401
from sklearn.linear_model import LogisticRegression # noqa: F401
except ImportError as e:
raise ImportError(
"In order to run `ArenaHardResults`, the `arena-hard` extra dependencies"
" must be installed i.e. `numpy`, `pandas`, and `scikit-learn`.\n"
"Please install the dependencies by running `pip install distilabel[arena-hard]`."
) from e

# TODO: the `evaluation` is not really required as an input, so it could be removed, since
# only `score` is used / required
@property
def inputs(self) -> List[str]:
"""The inputs required by this step are the `evaluation` and the `score` generated
by the `ArenaHard` task. Since this step does use the identifiers `model_a` and `model_b`,
optionally one can set `custom_model_column` to use the model names if existing within
the input data, ideally this value should be `model_name` if connected from the `ArenaHard`
step."""
columns = ["evaluation", "score"]
if self.custom_model_column:
columns.append(self.custom_model_column)
return columns

@override
def process(self, inputs: StepInput) -> StepOutput: # type: ignore
"""This method processes the inputs generated by the `ArenaHard` task to calculate the
win rates for each of the models to evaluate. Since this step inherits from the `GlobalStep`,
it will wait for all the input batches to be processed, and then the output will be yielded in
case there's a follow up step, since this step won't modify the received inputs.
Args:
inputs: A list of Python dictionaries with the inputs of the task.
Yields:
A list of Python dictionaries with the outputs of the task.
References:
- https://github.com/lm-sys/arena-hard-auto/blob/main/show_result.py
"""
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

models = ["A", "B"]
if self.custom_model_column:
models = inputs[0][self.custom_model_column]

# TODO: the battles are only calculated for the first game, even though the official
# implementation also covers the possibility of a second game (not within the released
# dataset yet)
battles = pd.DataFrame()
for input in inputs:
output = {
# TODO: "question_id": input["question_id"],
"model_a": models[0],
"model_b": models[1],
}
if input["score"] in ["A>B", "A>>B"]:
output["winner"] = models[0]
rows = [output] * self.custom_weights[input["score"]]
elif input["score"] in ["B>A", "B>>A"]:
output["winner"] = models[1]
rows = [output] * self.custom_weights[input["score"]]
elif input["score"] == "A=B":
output["winner"] = "tie"
rows = [output]
else:
continue

battles = pd.concat([battles, pd.DataFrame(rows)])

models = pd.concat([battles["model_a"], battles["model_b"]]).unique()
models = pd.Series(np.arange(len(models)), index=models)

battles = pd.concat([battles, battles], ignore_index=True)
p = len(models.index)
n = battles.shape[0]

X = np.zeros([n, p])
X[np.arange(n), models[battles["model_a"]]] = +np.log(10)
X[np.arange(n), models[battles["model_b"]]] = -np.log(10)

Y = np.zeros(n)
Y[battles["winner"] == "model_a"] = 1.0

tie_idx = battles["winner"] == "tie"
tie_idx[len(tie_idx) // 2 :] = False
Y[tie_idx] = 1.0

lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8) # type: ignore
lr.fit(X, Y)

# The ELO scores are calculated assuming that the reference is `gpt-4-0314`
# with an starting ELO of 1000, so that the evaluated models are compared with
# `gtp-4-0314` only if it's available within the models
elo_scores = 400 * lr.coef_[0] + 1000
# TODO: we could parametrize the reference / anchor model, but left as is to be faithful to the
# original implementation
if "gpt-4-0314" in models.index:
elo_scores += 1000 - elo_scores[models["gpt-4-0314"]]

output = pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
self._logger.info(f"Arena Hard ELO: {output}")

# Here only so that if follow up steps are connected the inputs are preserved,
# since this step doesn't modify nor generate new inputs
yield inputs
2 changes: 1 addition & 1 deletion src/distilabel/steps/tasks/text_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def format_input(self, input: Dict[str, Any]) -> ChatType:

if not is_openai_format(input["messages"]):
raise ValueError(
"Input `instruction` must be a string or an OpenAI chat-like format. "
"Input `messages` must be an OpenAI chat-like format conversation. "
f"Got: {input['messages']}. Please check: 'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'."
)

Expand Down
Loading

0 comments on commit d32d664

Please sign in to comment.