From 300a67d5af6451ed9760e55dcf83ba26c95a6a17 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome
Date: Mon, 8 Jan 2024 13:34:52 +0100
Subject: [PATCH] Documentation review (#223)

* Fix conflictive line-break within docstring example
* Add `PrometheusTask` docstrings
* Add missing `kwargs` type-hint to `Any`
* Fix line-break issues within docstrings
* Fix docstring formatting in `PrometheusTask`
* Fix referencing issues pointed out by `mkdocs`
* Add `UltraCMTask` docstrings
* Add references section within `{*}Task` docstrings
* Add disclaimer in `CritiqueTask` subclasses
* Exclude some modules from `gen_ref_pages.py`
---
 docs/index.md                                 |  2 +-
 docs/scripts/gen_ref_pages.py                 | 19 ++++-
 docs/technical-reference/pipeline.md          |  2 +-
 src/distilabel/pipeline.py                    |  2 +-
 src/distilabel/tasks/critique/prometheus.py   | 50 +++++++++++++++++++
 src/distilabel/tasks/critique/ultracm.py      | 39 +++++++++++++++
 src/distilabel/tasks/preference/judgelm.py    |  8 ++-
 .../tasks/preference/ultrafeedback.py         |  6 ++-
 src/distilabel/tasks/preference/ultrajudge.py |  4 ++
 src/distilabel/tasks/prompt.py                |  2 +-
 .../tasks/text_generation/principles.py       |  3 +-
 .../tasks/text_generation/self_instruct.py    |  8 +--
 12 files changed, 134 insertions(+), 11 deletions(-)

diff --git a/docs/index.md b/docs/index.md
index d8302c83c9..3100bbd404 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -54,7 +54,7 @@ For a more complete example, check out our awesome notebook on Google Colab:
 
     Understand the components and their interactions.
 
--   [**API Reference**](./reference/distilabel)
+-   [**API Reference**](./reference/distilabel/index.md)
 
 ---
diff --git a/docs/scripts/gen_ref_pages.py b/docs/scripts/gen_ref_pages.py
index f249af899d..efe51411a8 100644
--- a/docs/scripts/gen_ref_pages.py
+++ b/docs/scripts/gen_ref_pages.py
@@ -1,4 +1,18 @@
-# https://mkdocstrings.github.io/recipes/#automatic-code-reference-pages
+# Copyright 2023-present, Argilla, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Ported from https://mkdocstrings.github.io/recipes/#automatic-code-reference-pages
 
 from pathlib import Path
 
@@ -7,8 +21,11 @@
 nav = mkdocs_gen_files.Nav()
 
 src = Path(__file__).parent.parent.parent / "src"
+excluded = ["distilabel/utils", "distilabel/logger.py", "distilabel/progress_bar.py"]
 
 for path in sorted(src.rglob("*.py")):
+    if any(exclude in str(path) for exclude in excluded):
+        continue
     module_path = path.relative_to(src).with_suffix("")
     doc_path = path.relative_to(src).with_suffix(".md")
     full_doc_path = Path("reference", doc_path)
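The exclusion entries above mix a directory prefix (`distilabel/utils`) with bare file names, so the membership test runs against the full path: `path.name` is only the file name and can never contain a `/`. A minimal sketch of the intended matching, with hypothetical paths for illustration:

```python
from pathlib import Path

excluded = ["distilabel/utils", "distilabel/logger.py", "distilabel/progress_bar.py"]

# Hypothetical module paths, mirroring the `src` tree the script walks.
paths = [
    Path("src/distilabel/pipeline.py"),
    Path("src/distilabel/logger.py"),
    Path("src/distilabel/utils/dicts.py"),
]

for path in paths:
    # `path.name` would be "dicts.py" for the last entry, which can never
    # contain "distilabel/utils"; matching on the full path catches both
    # the directory-level and the file-level exclusions.
    skipped = any(exclude in str(path) for exclude in excluded)
    print(path, "-> excluded" if skipped else "-> documented")
```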
diff --git a/docs/technical-reference/pipeline.md b/docs/technical-reference/pipeline.md
index 93406211fe..8e1886ac80 100644
--- a/docs/technical-reference/pipeline.md
+++ b/docs/technical-reference/pipeline.md
@@ -21,7 +21,7 @@ We will create a [`Pipeline`][distilabel.pipeline.Pipeline] that will use [Notus
 --8<-- "docs/snippets/technical-reference/pipeline/pipeline_generator_1.py"
 ```
 
-We've set up our pipeline using a specialized [`TextGenerationTask`](distilabel.tasks.text_generation.base.TextGenerationTask) (refer to the [tasks section](./tasks.md) for more task details), and an [InferenceEndpointsLLM][distilabel.llm.huggingface.inference_endpoints.InferenceEndpointsLLM] configured for [`notus-7b-v1`](https://huggingface.co/argilla/notus-7b-v1), although any of the available `LLMs` will work.
+We've set up our pipeline using a specialized [TextGenerationTask][distilabel.tasks.text_generation.base.TextGenerationTask] (refer to the [tasks section](./tasks.md) for more task details), and an [InferenceEndpointsLLM][distilabel.llm.huggingface.inference_endpoints.InferenceEndpointsLLM] configured for [`notus-7b-v1`](https://huggingface.co/argilla/notus-7b-v1), although any of the available `LLMs` will work.
 
 To use the [Pipeline][distilabel.pipeline.Pipeline] for dataset generation, we call the generate method. We provide it with the input dataset and specify the desired number of generations. In this example, we've prepared a `Dataset` with a single row to illustrate the process.
 This dataset contains one row, and we'll trigger 2 generations from it:
diff --git a/src/distilabel/pipeline.py b/src/distilabel/pipeline.py
index 9dffec9c3a..3096283a62 100644
--- a/src/distilabel/pipeline.py
+++ b/src/distilabel/pipeline.py
@@ -746,7 +746,7 @@ def pipeline(
     *,
     generator: Optional["LLM"] = None,
     labeller: Optional["LLM"] = None,
-    **kwargs,
+    **kwargs: Any,
 ) -> Pipeline:
     """Creates a `Pipeline` instance with the provided LLMs for a given task,
     which is useful whenever you want to use a pre-defined `Pipeline` for a given
     task, or if you want to
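The `--8<--` include in the `pipeline.md` hunk above pulls the pipeline-construction snippet from the docs tree rather than inlining it. As a rough sketch of the `generate` call the surrounding prose describes (the dataset contents and the keyword arguments are assumptions based on that prose, not a verbatim copy of the snippet):

```python
from datasets import Dataset

from distilabel.pipeline import Pipeline


def run_single_row_example(pipeline: Pipeline) -> Dataset:
    """Runs the documented single-row example against an already-built pipeline."""
    # A hypothetical single-row input dataset, as described in the docs above.
    dataset = Dataset.from_dict({"input": ["Write a short poem about synthetic data."]})
    # Request 2 generations for the single input row, per the prose above.
    return pipeline.generate(dataset=dataset, num_generations=2)
```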
diff --git a/src/distilabel/tasks/critique/prometheus.py b/src/distilabel/tasks/critique/prometheus.py
index 95a4910d22..5eb7908bfa 100644
--- a/src/distilabel/tasks/critique/prometheus.py
+++ b/src/distilabel/tasks/critique/prometheus.py
@@ -25,6 +25,28 @@
 
 @dataclass
 class PrometheusTask(CritiqueTask):
+    """A `CritiqueTask` following the prompt template used by Prometheus.
+
+    Args:
+        system_prompt (str, optional): the system prompt to be used for generation. Defaults to `None`.
+        scoring_criteria (str): the scoring criteria to be used for the task, which defines
+            the scores described below via `score_descriptions`.
+        score_descriptions (Dict[int, str]): the descriptions of the scores, where
+            the key is the rating value (ideally these should be consecutive), and the
+            value is the description of each rating.
+
+    Disclaimer:
+        Since the Prometheus model has been trained on data generated via the OpenAI API, the
+        prompting strategy may only be consistent / compliant with either GPT-3.5 or GPT-4
+        from the OpenAI API, or with the Prometheus model itself. Any other model may fail to
+        generate a structured output, or may provide an incorrect / inaccurate critique.
+
+    References:
+        - [`Prometheus: Inducing Fine-grained Evaluation Capability in Language Models`](https://arxiv.org/abs/2310.08491)
+        - [`kaist-ai/prometheus-7b-v1.0`](https://huggingface.co/kaist-ai/prometheus-7b-v1.0)
+        - [`kaist-ai/prometheus-13b-v1.0`](https://huggingface.co/kaist-ai/prometheus-13b-v1.0)
+    """
+
     scoring_criteria: str
     score_descriptions: Dict[int, str]
 
@@ -39,6 +61,34 @@ def input_args_names(self) -> List[str]:
 
     def generate_prompt(
         self, input: str, generations: str, ref_completion: str, **_: Any
     ) -> Prompt:
+        """Generates a prompt following the Prometheus specification.
+
+        Args:
+            input (str): the input to be used for the prompt.
+            generations (str): the generation to be used for the prompt, in
+                this case, the one to be critiqued.
+            ref_completion (str): the reference completion to be used for the prompt,
+                assumed to be the one with the highest score.
+
+        Returns:
+            Prompt: the generated prompt.
+
+        Examples:
+            >>> from distilabel.tasks.critique import PrometheusTask
+            >>> task = PrometheusTask(
+            ...     scoring_criteria="Overall quality of the responses provided.",
+            ...     score_descriptions={0: "false", 1: "partially false", 2: "average", 3: "partially true", 4: "true"},
+            ... )
+            >>> task.generate_prompt(
+            ...     input="What are the first 5 Fibonacci numbers?",
+            ...     generations="0 1 1 2 3",
+            ...     ref_completion="0 1 1 2 3",
+            ... )
+            Prompt(
+                system_prompt="You are a fair evaluator language model.",
+                formatted_prompt="###Task Description:...",
+            )
+        """
         render_kwargs = {
             "instruction": input,
             "completion": generations,
diff --git a/src/distilabel/tasks/critique/ultracm.py b/src/distilabel/tasks/critique/ultracm.py
index a4fd751e14..d7dc08938b 100644
--- a/src/distilabel/tasks/critique/ultracm.py
+++ b/src/distilabel/tasks/critique/ultracm.py
@@ -28,6 +28,23 @@
 
 @dataclass
 class UltraCMTask(CritiqueTask):
+    """A `CritiqueTask` following the prompt template used by UltraCM (from UltraFeedback).
+
+    Args:
+        system_prompt (str, optional): the system prompt to be used for generation. Defaults to `None`.
+
+    Disclaimer:
+        Since the UltraCM model has been trained on data generated via the OpenAI API, the
+        prompting strategy may only be consistent / compliant with either GPT-3.5 or GPT-4
+        from the OpenAI API, or with the UltraCM model itself. Any other model may fail to
+        generate a structured output, or may provide an incorrect / inaccurate critique.
+
+    References:
+        - [`UltraFeedback: Boosting Language Models with High-quality Feedback`](https://arxiv.org/abs/2310.01377)
+        - [`UltraFeedback - GitHub Repository`](https://github.com/OpenBMB/UltraFeedback)
+        - [`openbmb/UltraCM-13b`](https://huggingface.co/openbmb/UltraCM-13b)
+    """
+
     __jinja2_template__: ClassVar[str] = _ULTRACM_TEMPLATE
 
     system_prompt: str = (
@@ -37,6 +54,28 @@ class UltraCMTask(CritiqueTask):
     )
 
     def generate_prompt(self, input: str, generations: str, **_: Any) -> Prompt:
+        """Generates a prompt following the UltraCM specification.
+
+        Args:
+            input (str): the input to be used for the prompt.
+            generations (str): the generation to be used for the prompt, in
+                this case, the one to be critiqued.
+
+        Returns:
+            Prompt: the generated prompt.
+
+        Examples:
+            >>> from distilabel.tasks.critique import UltraCMTask
+            >>> task = UltraCMTask()
+            >>> task.generate_prompt(
+            ...     input="What are the first 5 Fibonacci numbers?",
+            ...     generations="0 1 1 2 3",
+            ... )
+            Prompt(
+                system_prompt="User: A one-turn chat between a curious user ...",
+                formatted_prompt="User: Given my answer to an instruction, your role ...",
+            )
+        """
         render_kwargs = {
             "instruction": input,
             "completion": generations,
...", ) """ render_kwargs = { diff --git a/src/distilabel/tasks/preference/ultrafeedback.py b/src/distilabel/tasks/preference/ultrafeedback.py index deec1d2249..7583d085a5 100644 --- a/src/distilabel/tasks/preference/ultrafeedback.py +++ b/src/distilabel/tasks/preference/ultrafeedback.py @@ -56,6 +56,10 @@ class UltraFeedbackTask(PreferenceTask): system_prompt (str, optional): the system prompt to be used for generation. Defaults to `None`. task_description (Union[str, None], optional): the description of the task. Defaults to `None`. ratings (Union[List[Rating], None], optional): the ratings to be used for the task. Defaults to `None`. + + References: + - [`UltraFeedback: Boosting Language Models with High-quality Feedback`](https://arxiv.org/abs/2310.01377) + - [`UltraFeedback - GitHub Repository`](https://github.com/OpenBMB/UltraFeedback) """ ratings: List[Rating] @@ -92,7 +96,7 @@ def generate_prompt(self, input: str, generations: List[str], **_: Any) -> Promp >>> task.generate_prompt("What are the first 5 Fibonacci numbers?", ["0 1 1 2 3", "0 1 1 2 3"]) Prompt( system_prompt="Your role is to evaluate text quality based on given criteria.", - formatted_prompt="# General Text Quality Assessment\nEvaluate the model's ...", + formatted_prompt="# General Text Quality Assessment...", ) """ render_kwargs = { diff --git a/src/distilabel/tasks/preference/ultrajudge.py b/src/distilabel/tasks/preference/ultrajudge.py index 02d04e676f..c61a220e35 100644 --- a/src/distilabel/tasks/preference/ultrajudge.py +++ b/src/distilabel/tasks/preference/ultrajudge.py @@ -47,6 +47,10 @@ class UltraJudgeTask(PreferenceTask): task_description (Union[str, None], optional): the description of the task. Defaults to `None`. areas (List[str], optional): the areas to be used for the task. Defaults to a list of four areas: "Practical Accuracy", "Clarity & Transparency", "Authenticity & Reliability", and "Compliance with Intent". + + References: + - [`UltraFeedback: Boosting Language Models with High-quality Feedback`](https://arxiv.org/abs/2310.01377) + - [`Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena`](https://arxiv.org/abs/2306.05685) """ system_prompt: str = ( diff --git a/src/distilabel/tasks/prompt.py b/src/distilabel/tasks/prompt.py index 010e38e9d0..5969cd66e7 100644 --- a/src/distilabel/tasks/prompt.py +++ b/src/distilabel/tasks/prompt.py @@ -66,7 +66,7 @@ def format_as(self, format: SupportedFormats) -> Union[str, List[ChatCompletion] ... formatted_prompt="What are the first 5 Fibonacci numbers?", ... ) >>> prompt.format_as("default") - 'You are a helpful assistant.\nWhat are the first 5 Fibonacci numbers?' + 'You are a helpful assistant. What are the first 5 Fibonacci numbers?' """ if format == "default": return f"{self.system_prompt}\n{self.formatted_prompt}" diff --git a/src/distilabel/tasks/text_generation/principles.py b/src/distilabel/tasks/text_generation/principles.py index 27866a7a9a..cfc9ff7252 100644 --- a/src/distilabel/tasks/text_generation/principles.py +++ b/src/distilabel/tasks/text_generation/principles.py @@ -18,7 +18,8 @@ class UltraFeedbackPrinciples: be injected into the system prompt given to the LLM. 
References: - - https://github.com/OpenBMB/UltraFeedback + - [`UltraFeedback: Boosting Language Models with High-quality Feedback`](https://arxiv.org/abs/2310.01377) + - [`UltraFeedback - GitHub Repository`](https://github.com/OpenBMB/UltraFeedback) """ helpfulness = [ diff --git a/src/distilabel/tasks/text_generation/self_instruct.py b/src/distilabel/tasks/text_generation/self_instruct.py index 7f20c191ac..a0cb9d74c0 100644 --- a/src/distilabel/tasks/text_generation/self_instruct.py +++ b/src/distilabel/tasks/text_generation/self_instruct.py @@ -40,8 +40,6 @@ class SelfInstructTask(TextGenerationTask): """A `TextGenerationTask` following the Self-Instruct specification for building the prompts. - Reference: https://github.com/yizhongw/self-instruct - Args: system_prompt (str, optional): the system prompt to be used. Defaults to `None`. principles (Dict[str, List[str]], optional): the principles to be used for the system prompt. @@ -52,6 +50,10 @@ class SelfInstructTask(TextGenerationTask): "AI assistant". num_instructions (int, optional): the number of instructions to be used for the prompt. Defaults to 5. + + References: + - [`Self-Instruct: Aligning Language Models with Self-Generated Instructions`](https://arxiv.org/abs/2212.10560) + - [`Self-Instruct - GitHub Repository`](https://github.com/yizhongw/self-instruct) """ system_prompt: str = ( @@ -79,7 +81,7 @@ def generate_prompt(self, input: str, **_: Any) -> Prompt: >>> task.generate_prompt("What are the first 5 Fibonacci numbers?") Prompt( system_prompt="You are a helpful assistant.", - formatted_prompt="# Task Description\nDevelop 2 user queries that ...", + formatted_prompt="# Task Description ...", ) """ render_kwargs = {