From b6d57f05b488c356340e187f3a7703eae0625720 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome
Date: Wed, 27 Dec 2023 12:07:38 +0100
Subject: [PATCH] Add `CritiqueTask` documentation (#200)

* Add `CritiqueTask` documentation
* Align namings and fix typos in `docs/`
---
 docs/index.md                                 |  4 +--
 .../llm/transformers_generate.py              |  2 +-
 .../technical-reference/tasks/prometheus.py   | 13 +++++++
 .../technical-reference/tasks/ultracm.py      |  5 +++
 docs/technical-reference/llms.md              |  6 ++--
 docs/technical-reference/pipeline.md          |  8 ++---
 docs/technical-reference/tasks.md             | 36 ++++++++++++++++++-
 7 files changed, 63 insertions(+), 11 deletions(-)
 create mode 100644 docs/snippets/technical-reference/tasks/prometheus.py
 create mode 100644 docs/snippets/technical-reference/tasks/ultracm.py

diff --git a/docs/index.md b/docs/index.md
index 8607780c8e..dc2cc8d4ba 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -16,7 +16,7 @@ Requires Python 3.8+
 In addition, the following extras are available:
 - `hf-transformers`: for using models available in [transformers](https://github.com/huggingface/transformers) package via the `TransformersLLM` integration.
-- `hf-inference-endpoints`: for using the [Hugging Face Inference Endpoints](https://huggingface.co/inference-endpoints) via the `InferenceEndpointsLLM` integration.
+- `hf-inference-endpoints`: for using the [HuggingFace Inference Endpoints](https://huggingface.co/inference-endpoints) via the `InferenceEndpointsLLM` integration.
 - `openai`: for using OpenAI API models via the `OpenAILLM` integration.
 - `vllm`: for using [vllm](https://github.com/vllm-project/vllm) serving engine via the `vLLM` integration.
 - `llama-cpp`: for using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) as Python bindings for `llama.cpp`.
@@ -56,4 +56,4 @@ For a more complete example, check out our awesome notebook on Google Colab:
     Technical description of the classes and functions.
-
\ No newline at end of file
+
diff --git a/docs/snippets/technical-reference/llm/transformers_generate.py b/docs/snippets/technical-reference/llm/transformers_generate.py
index 5f1358d46b..8219ca84ce 100644
--- a/docs/snippets/technical-reference/llm/transformers_generate.py
+++ b/docs/snippets/technical-reference/llm/transformers_generate.py
@@ -2,7 +2,7 @@
 from distilabel.tasks import TextGenerationTask
 from transformers import AutoModelForCausalLM, AutoTokenizer
-# Load the models from huggingface hub:
+# Load the models from the HuggingFace Hub
 tokenizer = AutoTokenizer.from_pretrained("argilla/notus-7b-v1")
 model = AutoModelForCausalLM.from_pretrained("argilla/notus-7b-v1", device_map="auto")
diff --git a/docs/snippets/technical-reference/tasks/prometheus.py b/docs/snippets/technical-reference/tasks/prometheus.py
new file mode 100644
index 0000000000..39d5890b07
--- /dev/null
+++ b/docs/snippets/technical-reference/tasks/prometheus.py
@@ -0,0 +1,13 @@
+from distilabel.tasks import PrometheusTask
+
+task = PrometheusTask(
+    system_prompt="You are a fair evaluator language model.",
+    scoring_criteria="Relevance, Grammar, Informativeness, Engagement",
+    score_descriptions={
+        1: "The response is not relevant to the prompt.",
+        2: "The response is relevant to the prompt, but it is not grammatical.",
+        3: "The response is relevant to the prompt and it is grammatical, but it is not informative.",
+        4: "The response is relevant to the prompt, it is grammatical, and it is informative, but it is not engaging.",
+        5: "The response is relevant to the prompt, it is grammatical, it is informative, and it is engaging.",
+    },
+)
diff --git a/docs/snippets/technical-reference/tasks/ultracm.py b/docs/snippets/technical-reference/tasks/ultracm.py
new file mode 100644
index 0000000000..6fd5ef7ef8
--- /dev/null
+++ b/docs/snippets/technical-reference/tasks/ultracm.py
@@ -0,0 +1,5 @@
+from distilabel.tasks import UltraCMTask
+
+task = UltraCMTask(
+    system_prompt="User: A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.",
+)
diff --git a/docs/technical-reference/llms.md b/docs/technical-reference/llms.md
index 68f5b22bf5..22b0fdf73b 100644
--- a/docs/technical-reference/llms.md
+++ b/docs/technical-reference/llms.md
@@ -107,7 +107,7 @@ For the API reference visit [OpenAILLM][distilabel.llm.openai.OpenAILLM].
 ### Llama.cpp
-Applicable for local execution of Language Models (LLMs). Utilize this LLM when you have access to the quantized weights of your selected model for interaction.
+Applicable for local execution of Language Models (LLMs). Use this LLM when you have access to the quantized weights of your selected model for interaction.
 Let's see an example using [notus-7b-v1](https://huggingface.co/argilla/notus-7b-v1). First, you can download the weights from the following [link](https://huggingface.co/TheBloke/notus-7B-v1-GGUF):
@@ -135,7 +135,7 @@ This section explains two different ways to use HuggingFace models:
 #### Transformers
-This is the option to utilize a model hosted on Hugging Face Hub. Load the model and tokenizer in the standard manner as done locally, and proceed to instantiate your class.
+This is the option to use a model hosted on the HuggingFace Hub. Load the model and tokenizer in the standard manner as done locally, and proceed to instantiate your class.
 For the API reference visit [TransformersLLM][distilabel.llm.huggingface.transformers.TransformersLLM].
@@ -147,7 +147,7 @@ Let's see an example using [notus-7b-v1](https://huggingface.co/argilla/notus-7b
 #### Inference Endpoints
-Hugging Face provides a streamlined approach for deploying models through [inference endpoints](https://huggingface.co/inference-endpoints) on their infrastructure. Opt for this solution if your model is hosted on Hugging Face.
+HuggingFace provides a streamlined approach for deploying models through [Inference Endpoints](https://huggingface.co/inference-endpoints) on their infrastructure. Opt for this solution if your model is hosted on the HuggingFace Hub.
 For the API reference visit [InferenceEndpointsLLM][distilabel.llm.huggingface.inference_endpoints.InferenceEndpointsLLM].
diff --git a/docs/technical-reference/pipeline.md b/docs/technical-reference/pipeline.md
index 97e37551f9..d9d8347c45 100644
--- a/docs/technical-reference/pipeline.md
+++ b/docs/technical-reference/pipeline.md
@@ -12,7 +12,7 @@ Let's start by a Pipeline with a single `LLM` as a generator.
 ### Generator
-We will create a [`Pipeline`][distilabel.pipeline.Pipeline] that will use [Notus](https://huggingface.co/argilla/notus-7b-v1) from a Huggingface [Inference Endpoint][distilabel.llm.InferenceEndpointsLLM]. For this matter, we need to create a [TextGenerationTask][distilabel.tasks.TextGenerationTask], and specify the format we want to use for our `Prompt`, in this case *notus*, which corresponds to the same for *zephyr*.
+We will create a [`Pipeline`][distilabel.pipeline.Pipeline] that will use [Notus](https://huggingface.co/argilla/notus-7b-v1) from a HuggingFace [Inference Endpoint][distilabel.llm.InferenceEndpointsLLM]. To do so, we need to create a [TextGenerationTask][distilabel.tasks.TextGenerationTask] and specify the format we want to use for our `Prompt`, in this case *Notus*, which is the same as for *Zephyr*.
 ```python
 --8<-- "docs/snippets/technical-reference/pipeline/pipeline_generator_1.py"
 ```
 We've set up our pipeline using a specialized [`TextGenerationTask`](distilabel.tasks.text_generation.base.TextGenerationTask) (refer to the [tasks section](./tasks.md) for more task details), and an [InferenceEndpointsLLM][distilabel.llm.huggingface.inference_endpoints.InferenceEndpointsLLM] configured for [`notus-7b-v1`](https://huggingface.co/argilla/notus-7b-v1), although any of the available `LLMs` will work.
-To utilize the [Pipeline][distilabel.pipeline.Pipeline] for dataset generation, we call the generate method. We provide it with the input dataset and specify the desired number of generations. In this example, we've prepared a `Dataset` with a single row to illustrate the process. This dataset contains one row, and we'll trigger 2 generations from it:
+To use the [Pipeline][distilabel.pipeline.Pipeline] for dataset generation, we call the `generate` method, providing it with the input dataset and the desired number of generations. In this example, we've prepared a `Dataset` with a single row to illustrate the process, and we'll trigger 2 generations from it:
 ```python
 --8<-- "docs/snippets/technical-reference/pipeline/pipeline_generator_2.py"
 ```
@@ -34,7 +34,7 @@ Now, let's examine the dataset that was generated. It's a [`CustomDataset`][dist
 ### Labeller
-Next, we move on to labelLing a dataset. Just as before, we need an `LLM` for our `Pipeline`. In this case we will use [`OpenAILLM`][distilabel.llm.openai.OpenAILLM] with `gpt-4`, and a `PreferenceTask`, [UltraFeedbackTask][distilabel.tasks.preference.ultrafeedback.UltraFeedbackTask] for instruction following.
+Next, we move on to labelling a dataset. Just as before, we need an `LLM` for our `Pipeline`. In this case, we will use [`OpenAILLM`][distilabel.llm.openai.OpenAILLM] with `gpt-4`, and a `PreferenceTask`, [UltraFeedbackTask][distilabel.tasks.preference.ultrafeedback.UltraFeedbackTask], for instruction following.
 ```python
 --8<-- "docs/snippets/technical-reference/pipeline/pipeline_labeller_1.py"
 ```
@@ -142,7 +142,7 @@ The API reference can be found here: [pipeline][distilabel.pipeline.pipeline]
 ## Argilla integration
-The [CustomDataset][distilabel.dataset.CustomDataset] generated entirely by AI models may require some additional human processing. To facilitate human feedback, the dataset can be uploaded to [`Argilla`](https://github.com/argilla-io/argilla). This process involves logging into an [`Argilla`](https://docs.argilla.io/en/latest/getting_started/cheatsheet.html#connect-to-argilla) instance, converting the dataset to the required format using `CustomDataset.to_argilla()`, and subsequently using push_to_argilla on the resulting dataset:
+The [CustomDataset][distilabel.dataset.CustomDataset] generated entirely by AI models may require some additional human processing. To facilitate human feedback, the dataset can be uploaded to [`Argilla`](https://github.com/argilla-io/argilla). This process involves logging into an [`Argilla`](https://docs.argilla.io/en/latest/getting_started/cheatsheet.html#connect-to-argilla) instance, converting the dataset to the required format using `CustomDataset.to_argilla()`, and subsequently using `push_to_argilla` on the resulting dataset:
 ```python
 --8<-- "docs/snippets/technical-reference/pipeline/argilla.py"
 ```
diff --git a/docs/technical-reference/tasks.md b/docs/technical-reference/tasks.md
index 1105f20957..fa0d4d0258 100644
--- a/docs/technical-reference/tasks.md
+++ b/docs/technical-reference/tasks.md
@@ -53,7 +53,7 @@ For the API reference visit [SelfInstructTask][distilabel.tasks.text_generation
 ## Labelling
-Instead of generating text, you can instruct the `LLM` to label datasets. The existing tasks are designed specifically for creating `Preference` datasets.
+Instead of generating text, you can instruct the `LLM` to label datasets. The existing tasks are designed specifically for creating labelled datasets, either via a `PreferenceTask` or a `CritiqueTask`.
 ### Preference
@@ -142,3 +142,37 @@ Which can be directly used in the following way:
 ```
 For the API reference visit [UltraJudgeTask][distilabel.tasks.preference.ultrajudge.UltraJudgeTask].
+
+### Critique
+
+The `CritiqueTask` is designed to label generated text: not only does it assign scores based on a rubric, it also produces critiques explaining why those scores were given. The critique can either rely on a reference answer (gold answer), as e.g. Prometheus does, or be generated independently for each of the N provided generations.
+
+The datasets that result from running a pipeline with the `CritiqueTask` are useful either for training a model to generate critiques, based on the critiques produced by a more powerful model such as OpenAI's GPT-4, or for direct use in DPO fine-tuning. Since a critique and a score are generated for each response, a balanced dataset can be built from the individual critiques and their scores, e.g. by defining a threshold on what is considered chosen and what rejected, and then running DPO fine-tunes.
+
+While the `CritiqueTask` may seem fairly similar to the `PreferenceTask`, there is a core difference: the critiques are provided per response, even for a single response, with no need to compare or rate the responses against each other.
+
+#### UltraCMTask
+
+This task is specifically designed to build the prompts following the format defined in the ["UltraFeedback: Boosting Language Models With High Quality Feedback"](https://arxiv.org/abs/2310.01377) paper.
+
+UltraCM is a model fine-tuned on the UltraFeedback dataset to produce critiques of generated content, as the authors state in their paper: "Moreover, since ULTRAFEEDBACK provides detailed textual feedback, we also fine-tune a model that could critique model responses automatically. Our critique model, UltraCM, generates reasonable and detailed comments on various tasks."
+
+Ideally, the `UltraCMTask` will be most consistent when used either with the fine-tuned UltraCM model or with OpenAI models, as both have proven to produce output that not only follows the expected prompt formatting, but is also meaningful and reasonable.
+
+The following snippet shows how to instantiate the `UltraCMTask`, which only requires the system prompt. The system prompt can be modified depending on how the critique is intended to be formulated; the one shown below is the default from the UltraFeedback paper.
+
+```python
+--8<-- "docs/snippets/technical-reference/tasks/ultracm.py"
+```
+
+#### PrometheusTask
+
+This task is specifically designed to build the prompts following the format defined in the ["Prometheus: Inducing Fine-grained Evaluation Capability in Language Models"](https://arxiv.org/abs/2310.08491) paper.
+
+Ideally, the `PrometheusTask` should only be used to format the prompts for the Prometheus models, as those are the ones fine-tuned to follow this exact formatting and will produce more consistent results than other base models or models fine-tuned with different formats. Since the formatting used by Prometheus follows the Llama 2 format, Llama 2 based models are recommended; otherwise, OpenAI models have also proven to produce consistent results.
+
+The following snippet can be used out of the box to define a simple `PrometheusTask` with the system prompt, the scoring criteria, and the score descriptions. These can be modified, keeping in mind that Prometheus always expects five scores from 1 to 5, each with a meaningful description, as well as scoring criteria relevant to those scores.
+
+```python
+--8<-- "docs/snippets/technical-reference/tasks/prometheus.py"
+```
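
Complementing the snippets above, here is a minimal, illustrative sketch of how a critique task such as `UltraCMTask` might be plugged into a labelling `Pipeline`, mirroring the labeller pattern documented in `pipeline.md`. The constructor arguments, the default `UltraCMTask()` system prompt, and the `input`/`generations` column names are assumptions for the example; adjust them to the actual distilabel API and to the columns your dataset exposes.

```python
from datasets import Dataset
from distilabel.llm import OpenAILLM
from distilabel.pipeline import Pipeline
from distilabel.tasks import UltraCMTask

# Hypothetical dataset to be labelled: each row holds an instruction ("input")
# and the candidate responses ("generations") to be critiqued and scored.
dataset = Dataset.from_dict(
    {
        "input": ["What is the capital of France?"],
        "generations": [["The capital of France is Paris.", "It might be Lyon."]],
    }
)

# Labelling-only pipeline: the critique task is wrapped by an LLM acting as the
# labeller, following the same pattern as the UltraFeedbackTask example.
pipeline = Pipeline(
    labeller=OpenAILLM(
        model="gpt-4",
        task=UltraCMTask(),  # assumed to default to the system prompt shown above
        max_new_tokens=512,
        openai_api_key="<OPENAI_API_KEY>",  # placeholder
    ),
)

labelled_dataset = pipeline.generate(dataset=dataset)
```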
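
To make the chosen/rejected thresholding mentioned in the `CritiqueTask` description concrete, the following self-contained sketch turns per-response critique scores into DPO-style preference pairs. The `input`, `generations`, and `scores` keys are hypothetical placeholders for whatever columns the labelled dataset actually exposes, and the threshold value is arbitrary.

```python
# Illustrative sketch: build chosen/rejected pairs for DPO fine-tuning from
# per-response critique scores. Column names and threshold are placeholders.
THRESHOLD = 4.0


def to_dpo_pairs(rows, threshold=THRESHOLD):
    pairs = []
    for row in rows:
        # Sort this row's responses by their critique score, ascending.
        scored = sorted(zip(row["generations"], row["scores"]), key=lambda pair: pair[1])
        (worst_text, worst_score), (best_text, best_score) = scored[0], scored[-1]
        # Keep only rows where the best response clears the threshold and the
        # worst one falls below it, so the resulting pair is clearly separable.
        if best_score >= threshold and worst_score < threshold:
            pairs.append({"prompt": row["input"], "chosen": best_text, "rejected": worst_text})
    return pairs


rows = [
    {
        "input": "What is the capital of France?",
        "generations": ["The capital of France is Paris.", "I am not sure."],
        "scores": [5.0, 2.0],
    }
]
print(to_dpo_pairs(rows))  # -> one chosen/rejected pair for the single row above
```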