diff --git a/docs/sections/pipeline_samples/examples/index.md b/docs/sections/pipeline_samples/examples/index.md
index 19b2136278..ffcadb3199 100644
--- a/docs/sections/pipeline_samples/examples/index.md
+++ b/docs/sections/pipeline_samples/examples/index.md
@@ -60,3 +60,19 @@ Answer instructions with knowledge graphs defined as `pydantic.BaseModel` object
     ```
 
 ![Knowledge graph figure](../../../assets/images/sections/examples/knowledge-graph-example.png)
+
+
+### [Benchmarking with `distilabel`: Arena Hard](#benchmarking-with-distilabel-arena-hard)
+
+Benchmark LLMs with `distilabel`, reproducing the Arena Hard benchmark.
+
+??? Example "See example"
+
+    The script below first defines both the `ArenaHard` and the `ArenaHardResults` tasks: the former uses a judge LLM to compare a pair of responses to the same prompt, and the latter computes the final results following the original implementation. The second part of the example then builds a `Pipeline` that generates responses for a collection of prompts with `InferenceEndpointsLLM`, loads a pre-computed set of GPT-4 generations as the reference answers, and evaluates one against the other with `OpenAILLM`, which produces its own answer, a comparison between both responses, and a verdict formatted as A>>B, A>B, B>A, B>>A, or a tie.
+
+    To run this example you will first need to install `pandas`, `scikit-learn`, and `numpy`, since they are required to compute the Arena Hard results.
+
+    ```python title="arena_hard.py"
+    --8<-- "examples/arena_hard.py"
+    ```
+
diff --git a/src/distilabel/steps/tasks/benchmarks/arena_hard.py b/examples/arena_hard.py
similarity index 77%
rename from src/distilabel/steps/tasks/benchmarks/arena_hard.py
rename to examples/arena_hard.py
index 78cd7fa175..81bec55ace 100644
--- a/src/distilabel/steps/tasks/benchmarks/arena_hard.py
+++ b/examples/arena_hard.py
@@ -15,12 +15,11 @@
 import re
 from typing import Any, Dict, List, Optional, Union
 
-from typing_extensions import override
-
 from distilabel.steps import GlobalStep, StepInput
 from distilabel.steps.tasks.base import Task
 from distilabel.steps.tasks.typing import ChatType
 from distilabel.steps.typing import StepOutput
+from typing_extensions import override
 
 
 class ArenaHard(Task):
@@ -326,3 +325,134 @@ def process(self, inputs: StepInput) -> StepOutput:  # type: ignore
         # Here only so that if follow up steps are connected the inputs are preserved,
         # since this step doesn't modify nor generate new inputs
         yield inputs
+
+
+if __name__ == "__main__":
+    import json
+
+    from distilabel.llms import InferenceEndpointsLLM, OpenAILLM
+    from distilabel.pipeline import Pipeline
+    from distilabel.steps import (
+        CombineColumns,
+        KeepColumns,
+        LoadHubDataset,
+        StepInput,
+        step,
+    )
+    from distilabel.steps.tasks import TextGeneration
+    from distilabel.steps.typing import StepOutput
+
+    @step(inputs=["turns"], outputs=["system_prompt", "instruction"])
+    def PrepareForTextGeneration(*inputs: StepInput) -> StepOutput:
+        for input in inputs:
+            for item in input:
+                item["system_prompt"] = "You are a helpful assistant."
+                item["instruction"] = item["turns"][0]["content"]
+            yield input
+
+    @step(
+        inputs=["question_id"],
+        outputs=["generation", "generation_model"],
+        step_type="global",
+    )
+    def LoadReference(*inputs: StepInput) -> StepOutput:
+        # File downloaded from https://raw.githubusercontent.com/lm-sys/arena-hard-auto/e0a8ea1df42c1df76451a6cd04b14e31ff992b87/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl
+        lines = open("gpt-4-0314.jsonl", mode="r").readlines()
+        for input in inputs:
+            for item in input:
+                for line in lines:
+                    data = json.loads(line)
+                    if data["question_id"] == item["question_id"]:
+                        item["generation"] = data["choices"][0]["turns"][0]["content"]
+                        item["generation_model"] = data["model_id"]
+                        break
+            yield input
+
+    with Pipeline(name="arena-hard-v0.1") as pipeline:
+        load_dataset = LoadHubDataset(
+            name="load_dataset",
+            repo_id="alvarobartt/lmsys-arena-hard-v0.1",
+            split="test",
+            num_examples=5,
+        )
+
+        load_reference = LoadReference(name="load_reference")
+
+        prepare = PrepareForTextGeneration(name="prepare")
+
+        text_generation_cohere = TextGeneration(
+            name="text_generation_cohere",
+            llm=InferenceEndpointsLLM(
+                model_id="CohereForAI/c4ai-command-r-plus",
+                tokenizer_id="CohereForAI/c4ai-command-r-plus",
+            ),
+            use_system_prompt=True,
+            input_batch_size=10,
+            output_mappings={"model_name": "generation_model"},
+        )
+
+        combine_columns = CombineColumns(
+            name="combine_columns",
+            columns=["generation", "generation_model"],
+            output_columns=["generations", "generation_models"],
+        )
+
+        arena_hard = ArenaHard(
+            name="arena_hard",
+            llm=OpenAILLM(model="gpt-4-1106-preview"),
+            output_mappings={"model_name": "evaluation_model"},
+        )
+
+        keep_columns = KeepColumns(
+            name="keep_columns",
+            columns=[
+                "question_id",
+                "category",
+                "cluster",
+                "system_prompt",
+                "instruction",
+                "generations",
+                "generation_models",
+                "evaluation",
+                "score",
+                "evaluation_model",
+            ],
+        )
+
+        win_rates = ArenaHardResults(
+            name="win_rates", custom_model_column="generation_models"
+        )
+
+        load_dataset >> load_reference  # type: ignore
+        load_dataset >> prepare >> text_generation_cohere  # type: ignore
+        (  # type: ignore
+            [load_reference, text_generation_cohere]
+            >> combine_columns
+            >> arena_hard
+            >> keep_columns
+            >> win_rates
+        )
+
+    distiset = pipeline.run(
+        parameters={  # type: ignore
+            text_generation_cohere.name: {
+                "llm": {
+                    "generation_kwargs": {
+                        "temperature": 0.7,
+                        "max_new_tokens": 4096,
+                        "stop_sequences": ["<EOS_TOKEN>", "<|END_OF_TURN_TOKEN|>"],
+                    }
+                }
+            },
+            arena_hard.name: {
+                "llm": {
+                    "generation_kwargs": {
+                        "temperature": 0.0,
+                        "max_new_tokens": 4096,
+                    }
+                }
+            },
+        },
+    )
+    if distiset is not None:
+        distiset.push_to_hub("arena-hard-results")
diff --git a/pyproject.toml b/pyproject.toml
index 50c858462c..b94387ff51 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,9 +87,6 @@ outlines = ["outlines >= 0.0.40"]
 vertexai = ["google-cloud-aiplatform >= 1.38.0"]
 vllm = ["vllm >= 0.4.0", "outlines == 0.0.34", "filelock >= 3.13.4"]
 
-# Other optional dependencies
-arena-hard = ["pandas", "numpy", "scikit-learn"]
-
 [project.urls]
 Documentation = "https://distilabel.argilla.io/"
 Issues = "https://github.com/argilla/distilabel/issues"
diff --git a/scripts/install_dependencies.sh b/scripts/install_dependencies.sh
index 3372bd60fd..4da6ad9dd4 100755
--- a/scripts/install_dependencies.sh
+++ b/scripts/install_dependencies.sh
@@ -6,7 +6,7 @@ python_version=$(python -c "import sys; print(sys.version_info[:2])")
 
 python -m pip install uv
 
-uv pip install --system -e ".[dev,tests,anthropic,arena-hard,argilla,cohere,groq,hf-inference-endpoints,hf-transformers,litellm,llama-cpp,ollama,openai,outlines,vertexai]"
+uv pip install --system -e ".[dev,tests,anthropic,argilla,cohere,groq,hf-inference-endpoints,hf-transformers,litellm,llama-cpp,ollama,openai,outlines,vertexai]"
 if [ "${python_version}" != "(3, 8)" ]; then
   uv pip install --system -e .[mistralai,instructor]
 fi
diff --git a/src/distilabel/steps/tasks/__init__.py b/src/distilabel/steps/tasks/__init__.py
index d1bccf08f2..b2456d7824 100644
--- a/src/distilabel/steps/tasks/__init__.py
+++ b/src/distilabel/steps/tasks/__init__.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 from distilabel.steps.tasks.base import GeneratorTask, Task
-from distilabel.steps.tasks.benchmarks.arena_hard import ArenaHard, ArenaHardResults
 from distilabel.steps.tasks.complexity_scorer import ComplexityScorer
 from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
 from distilabel.steps.tasks.evol_instruct.evol_complexity.base import EvolComplexity
@@ -47,8 +46,6 @@
 from distilabel.steps.tasks.ultrafeedback import UltraFeedback
 
 __all__ = [
-    "ArenaHard",
-    "ArenaHardResults",
     "GeneratorTask",
     "Task",
     "ComplexityScorer",
diff --git a/src/distilabel/steps/tasks/benchmarks/__init__.py b/src/distilabel/steps/tasks/benchmarks/__init__.py
deleted file mode 100644
index 2598794f29..0000000000
--- a/src/distilabel/steps/tasks/benchmarks/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/tests/unit/steps/tasks/benchmarks/__init__.py b/tests/unit/steps/tasks/benchmarks/__init__.py
deleted file mode 100644
index 2598794f29..0000000000
--- a/tests/unit/steps/tasks/benchmarks/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/tests/unit/steps/tasks/benchmarks/test_arena_hard.py b/tests/unit/steps/tasks/benchmarks/test_arena_hard.py
deleted file mode 100644
index 40666e4402..0000000000
--- a/tests/unit/steps/tasks/benchmarks/test_arena_hard.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright 2023-present, Argilla, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from typing import Any, Dict, List, Union
-
-import pytest
-from _pytest.logging import LogCaptureFixture
-from distilabel.pipeline.local import Pipeline
-from distilabel.steps.tasks.benchmarks.arena_hard import ArenaHard, ArenaHardResults
-
-from tests.unit.conftest import DummyLLM
-
-
-class TestArenaHard:
-    def test_format_input(self) -> None:
-        task = ArenaHard(
-            name="arena_hard",
-            llm=DummyLLM(),
-            pipeline=Pipeline(name="unit-test-pipeline"),
-        )
-        task.load()
-
-        result = task.format_input(
-            input={
-                "instruction": "INSTRUCTION",
-                "generations": ["GENERATION_A", "GENERATION_B"],
-            }
-        )
-
-        assert result[-1] == {
-            "role": "user",
-            "content": "<|User Prompt|>\nINSTRUCTION\n\n<|The Start of Assistant A's Answer|>\nGENERATION_A\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\nGENERATION_B\n<|The End of Assistant B's Answer|>",
-        }
-
-    @pytest.mark.parametrize(
-        "output, expected",
-        [
-            (
-                "My own answer to the prompt would be:\nANSWER\nMy final veredict is: [[A>>B]]\n",
-                {
-                    "evaluation": "My own answer to the prompt would be:\nANSWER\nMy final veredict is: [[A>>B]]\n",
-                    "score": "A>>B",
-                },
-            ),
-            (
-                "My own answer to the prompt would be:\nANSWER\nMy final veredict is: TIE\n",
-                {
-                    "evaluation": "My own answer to the prompt would be:\nANSWER\nMy final veredict is: TIE\n",
-                    "score": None,
-                },
-            ),
-            (
-                None,
-                {"evaluation": None, "score": None},
-            ),
-        ],
-    )
-    def test_format_output(
-        self, output: Union[str, None], expected: Dict[str, Any]
-    ) -> None:
-        task = ArenaHard(
-            name="arena_hard",
-            llm=DummyLLM(),
-            pipeline=Pipeline(name="unit-test-pipeline"),
-        )
-        task.load()
-
-        assert (
-            task.format_output(
-                output=output,
-                input={
-                    "instruction": "INSTRUCTION",
-                    "generations": ["GENERATION_A", "GENERATION_B"],
-                },
-            )
-            == expected
-        )
-
-
-class TestArenaHardResults:
-    @pytest.mark.parametrize(
-        "custom_model_column, inputs",
-        [
-            ("model_name", ["evaluation", "score", "model_name"]),
-            (None, ["evaluation", "score"]),
-        ],
-    )
-    def test_inputs(
-        self, custom_model_column: Union[str, None], inputs: List[str]
-    ) -> None:
-        step = ArenaHardResults(
-            name="arena_hard_results",
-            custom_model_column=custom_model_column,
-            pipeline=Pipeline(name="unit-test-pipeline"),
-        )
-        assert step.inputs == inputs
-
-    def test_process(self, caplog: LogCaptureFixture) -> None:
-        step = ArenaHardResults(
-            name="arena_hard_results",
-            custom_model_column="model_names",
-            pipeline=Pipeline(name="unit-test-pipeline"),
-        )
-        step.load()
-
-        with caplog.at_level(logging.INFO):
-            next(
-                step.process(
-                    [
-                        {
-                            "evaluation": "...",
-                            "score": "A>>B",
-                            "model_names": ["gpt-4-0314", "other-model"],
-                        },
-                        {
-                            "evaluation": "...",
-                            "score": "A=B",
-                            "model_names": ["gpt-4-0314", "other-model"],
-                        },
-                        {
-                            "evaluation": "...",
-                            "score": "B>>A",
-                            "model_names": ["gpt-4-0314", "other-model"],
-                        },
-                    ]
-                )
-            )
-        assert (
-            "Arena Hard ELO: other-model 1445.577347\ngpt-4-0314 1000.000000\ndtype: float64\n"
-            in caplog.text
-        )
-
-    def test_process_errors(self) -> None:
-        step = ArenaHardResults(
-            name="arena_hard_results",
-            custom_model_column="model_names",
-            pipeline=Pipeline(name="unit-test-pipeline"),
-        )
-        step.load()
-
-        with pytest.raises(
-            ValueError,
-            match="This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0",
-        ):
-            next(
-                step.process(
-                    [
-                        {
-                            "evaluation": "...",
-                            "score": "A>>B",
-                            "model_names": ["gpt-4-0314", "other-model"],
-                        },
-                        {
-                            "evaluation": "...",
-                            "score": "B>>A",
-                            "model_names": ["gpt-4-0314", "other-model"],
-                        },
-                    ]
-                )
-            )
diff --git a/tests/unit/test_imports.py b/tests/unit/test_imports.py
index 94feb041ed..e20e186c8e 100644
--- a/tests/unit/test_imports.py
+++ b/tests/unit/test_imports.py
@@ -63,8 +63,6 @@ def test_imports() -> None:
     )
 
     from distilabel.steps.tasks import (
-        ArenaHard,
-        ArenaHardResults,
         Task,
        GeneratorTask,
        ChatItem,