Add examples/arena_hard.py and remove from distilabel core (#741)
* Remove `arena-hard` extras

* Remove `ArenaHard` and `ArenaHardResults`

* Add `examples/arena_hard.py`

* Add `arena_hard.py` example to `docs`

* Remove files included due to merge conflict
alvarobartt authored Jun 18, 2024
1 parent 356a4a3 commit 6bf14d0
Showing 9 changed files with 149 additions and 209 deletions.
16 changes: 16 additions & 0 deletions docs/sections/pipeline_samples/examples/index.md
@@ -60,3 +60,19 @@ Answer instructions with knowledge graphs defined as `pydantic.BaseModel` object
```

![Knowledge graph figure](../../../assets/images/sections/examples/knowledge-graph-example.png)


### [Benchmarking with `distilabel`: Arena Hard](#benchmarking-with-distilabel-arena-hard)

Benchmark LLMs with `distilabel`: reproducing the Arena Hard benchmark.

??? Example "See example"

    The script below first defines the `ArenaHard` and `ArenaHardResults` tasks: `ArenaHard` evaluates the responses generated for a given collection of prompts/questions by up to two LLMs, and `ArenaHardResults` calculates the results as per the original implementation. The second part of the example builds a `Pipeline` that runs the generation on top of the prompts with `InferenceEndpointsLLM`, loads the reference generations from a pre-computed set of GPT-4 (gpt-4-0314) answers, and then evaluates one against the other with `OpenAILLM`, which produces an alternate response, a comparison between the two responses, and a verdict of A>>B, A>B, B>A, B>>A, or tie.

    To run this example you will first need to install the optional dependencies used by Arena Hard, namely `pandas`, `scikit-learn`, and `numpy`.

    ```python title="arena_hard.py"
    --8<-- "examples/arena_hard.py"
    ```
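Not part of the committed example, but as a rough orientation for the verdict labels mentioned above: the sketch below shows one simplistic way labels such as A>>B or B>A could be pulled out of a judge evaluation and tallied into a naive win rate. The double-bracket pattern, the `extract_verdict` helper, and the plain counting are illustrative assumptions only; the actual `ArenaHardResults` computation relies on `pandas`, `scikit-learn`, and `numpy` as noted above.

```python
import re
from collections import Counter
from typing import List, Optional

# Hypothetical pattern: Arena Hard style judgments wrap the final verdict in
# double brackets, e.g. "[[A>>B]]" or "[[A=B]]".
VERDICT_PATTERN = re.compile(
    r"\[\[([AB]>>[AB]|[AB]>[AB]|[AB]<<[AB]|[AB]<[AB]|[AB]=[AB])\]\]"
)


def extract_verdict(evaluation: str) -> Optional[str]:
    """Return the verdict label found in a judge evaluation, if any."""
    match = VERDICT_PATTERN.search(evaluation)
    return match.group(1) if match else None


def naive_win_rate(verdicts: List[str]) -> float:
    """Fraction of comparisons in which model B beats model A (ties count towards the total)."""
    counts = Counter(verdicts)
    wins = counts["B>A"] + counts["B>>A"]
    losses = counts["A>B"] + counts["A>>B"]
    ties = counts["A=B"] + counts["B=A"]
    total = wins + losses + ties
    return wins / total if total else 0.0


# Usage with made-up judge outputs:
evaluations = ["... [[A>B]]", "... [[B>>A]]", "... [[A=B]]"]
labels = [v for v in (extract_verdict(e) for e in evaluations) if v is not None]
print(f"{naive_win_rate(labels):.2f}")  # 1 win out of 3 comparisons -> 0.33
```

In the script itself these verdicts surface as the `evaluation` and `score` columns kept by `KeepColumns` and aggregated by `ArenaHardResults` in the `win_rates` step.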

src/distilabel/steps/tasks/benchmarks/arena_hard.py → examples/arena_hard.py
@@ -15,12 +15,11 @@
import re
from typing import Any, Dict, List, Optional, Union

from typing_extensions import override

from distilabel.steps import GlobalStep, StepInput
from distilabel.steps.tasks.base import Task
from distilabel.steps.tasks.typing import ChatType
from distilabel.steps.typing import StepOutput
from typing_extensions import override


class ArenaHard(Task):
@@ -326,3 +325,134 @@ def process(self, inputs: StepInput) -> StepOutput:  # type: ignore
        # Here only so that if follow up steps are connected the inputs are preserved,
        # since this step doesn't modify nor generate new inputs
        yield inputs


if __name__ == "__main__":
    import json

    from distilabel.llms import InferenceEndpointsLLM, OpenAILLM
    from distilabel.pipeline import Pipeline
    from distilabel.steps import (
        CombineColumns,
        KeepColumns,
        LoadHubDataset,
        StepInput,
        step,
    )
    from distilabel.steps.tasks import TextGeneration
    from distilabel.steps.typing import StepOutput

@step(inputs=["turns"], outputs=["system_prompt", "instruction"])
def PrepareForTextGeneration(*inputs: StepInput) -> StepOutput:
for input in inputs:
for item in input:
item["system_prompt"] = "You are a helpful assistant."
item["instruction"] = item["turns"][0]["content"]
yield input

    # Global step: attach the pre-computed gpt-4-0314 reference answer to each
    # row by matching on `question_id`. Each JSONL line is expected to look
    # roughly like {"question_id": ..., "model_id": "gpt-4-0314",
    # "choices": [{"turns": [{"content": ...}]}]}.
    @step(
        inputs=["question_id"],
        outputs=["generation", "generation_model"],
        step_type="global",
    )
    def LoadReference(*inputs: StepInput) -> StepOutput:
        # File downloaded from https://raw.githubusercontent.com/lm-sys/arena-hard-auto/e0a8ea1df42c1df76451a6cd04b14e31ff992b87/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl
        lines = open("gpt-4-0314.jsonl", mode="r").readlines()
        for input in inputs:
            for item in input:
                for line in lines:
                    data = json.loads(line)
                    if data["question_id"] == item["question_id"]:
                        item["generation"] = data["choices"][0]["turns"][0]["content"]
                        item["generation_model"] = data["model_id"]
                        break
            yield input

    # Build the pipeline: generate answers for the Arena Hard prompts with
    # command-r-plus, pair them with the pre-computed gpt-4-0314 answers, judge
    # each pair with gpt-4-1106-preview, and aggregate the verdicts into win rates.
    with Pipeline(name="arena-hard-v0.1") as pipeline:
        load_dataset = LoadHubDataset(
            name="load_dataset",
            repo_id="alvarobartt/lmsys-arena-hard-v0.1",
            split="test",
            num_examples=5,
        )

        load_reference = LoadReference(name="load_reference")

        prepare = PrepareForTextGeneration(name="prepare")

        text_generation_cohere = TextGeneration(
            name="text_generation_cohere",
            llm=InferenceEndpointsLLM(
                model_id="CohereForAI/c4ai-command-r-plus",
                tokenizer_id="CohereForAI/c4ai-command-r-plus",
            ),
            use_system_prompt=True,
            input_batch_size=10,
            output_mappings={"model_name": "generation_model"},
        )

        combine_columns = CombineColumns(
            name="combine_columns",
            columns=["generation", "generation_model"],
            output_columns=["generations", "generation_models"],
        )

        arena_hard = ArenaHard(
            name="arena_hard",
            llm=OpenAILLM(model="gpt-4-1106-preview"),
            output_mappings={"model_name": "evaluation_model"},
        )

        keep_columns = KeepColumns(
            name="keep_columns",
            columns=[
                "question_id",
                "category",
                "cluster",
                "system_prompt",
                "instruction",
                "generations",
                "generation_models",
                "evaluation",
                "score",
                "evaluation_model",
            ],
        )

        win_rates = ArenaHardResults(
            name="win_rates", custom_model_column="generation_models"
        )

        # Connect the steps: the reference answers and the freshly generated ones
        # are combined, judged, filtered down to the relevant columns, and scored.
        load_dataset >> load_reference  # type: ignore
        load_dataset >> prepare >> text_generation_cohere  # type: ignore
        (  # type: ignore
            [load_reference, text_generation_cohere]
            >> combine_columns
            >> arena_hard
            >> keep_columns
            >> win_rates
        )

    # Run the pipeline, overriding each task's generation kwargs at runtime,
    # and push the resulting dataset to the Hugging Face Hub.
    distiset = pipeline.run(
        parameters={  # type: ignore
            text_generation_cohere.name: {
                "llm": {
                    "generation_kwargs": {
                        "temperature": 0.7,
                        "max_new_tokens": 4096,
                        "stop_sequences": ["<EOS_TOKEN>", "<|END_OF_TURN_TOKEN|>"],
                    }
                }
            },
            arena_hard.name: {
                "llm": {
                    "generation_kwargs": {
                        "temperature": 0.0,
                        "max_new_tokens": 4096,
                    }
                }
            },
        },
    )
    if distiset is not None:
        distiset.push_to_hub("arena-hard-results")
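Not part of the committed files, but as a hypothetical follow-up: once the pipeline has pushed its output, the resulting dataset could be inspected with `datasets`. The repository id, configuration name, and split below are illustrative assumptions, since the exact layout depends on your Hub account and on how `Distiset.push_to_hub` organizes the leaf steps.

```python
from datasets import load_dataset

# Illustrative values: replace the repo id with your own namespace, and the
# configuration name with whichever subset `Distiset.push_to_hub` created
# (here assumed to be named after the `win_rates` leaf step).
results = load_dataset("your-username/arena-hard-results", "win_rates", split="train")
print(results.column_names)  # expected to include e.g. instruction, generations, evaluation, score
```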
3 changes: 0 additions & 3 deletions pyproject.toml
@@ -87,9 +87,6 @@ outlines = ["outlines >= 0.0.40"]
vertexai = ["google-cloud-aiplatform >= 1.38.0"]
vllm = ["vllm >= 0.4.0", "outlines == 0.0.34", "filelock >= 3.13.4"]

# Other optional dependencies
arena-hard = ["pandas", "numpy", "scikit-learn"]

[project.urls]
Documentation = "https://distilabel.argilla.io/"
Issues = "https://github.com/argilla/distilabel/issues"
2 changes: 1 addition & 1 deletion scripts/install_dependencies.sh
@@ -6,7 +6,7 @@ python_version=$(python -c "import sys; print(sys.version_info[:2])")

python -m pip install uv

uv pip install --system -e ".[dev,tests,anthropic,arena-hard,argilla,cohere,groq,hf-inference-endpoints,hf-transformers,litellm,llama-cpp,ollama,openai,outlines,vertexai]"
uv pip install --system -e ".[dev,tests,anthropic,argilla,cohere,groq,hf-inference-endpoints,hf-transformers,litellm,llama-cpp,ollama,openai,outlines,vertexai]"
if [ "${python_version}" != "(3, 8)" ]; then
    uv pip install --system -e .[mistralai,instructor]
fi
3 changes: 0 additions & 3 deletions src/distilabel/steps/tasks/__init__.py
@@ -13,7 +13,6 @@
# limitations under the License.

from distilabel.steps.tasks.base import GeneratorTask, Task
from distilabel.steps.tasks.benchmarks.arena_hard import ArenaHard, ArenaHardResults
from distilabel.steps.tasks.complexity_scorer import ComplexityScorer
from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
from distilabel.steps.tasks.evol_instruct.evol_complexity.base import EvolComplexity
@@ -47,8 +46,6 @@
from distilabel.steps.tasks.ultrafeedback import UltraFeedback

__all__ = [
    "ArenaHard",
    "ArenaHardResults",
    "GeneratorTask",
    "Task",
    "ComplexityScorer",
13 changes: 0 additions & 13 deletions src/distilabel/steps/tasks/benchmarks/__init__.py

This file was deleted.

13 changes: 0 additions & 13 deletions tests/unit/steps/tasks/benchmarks/__init__.py

This file was deleted.
