distilabel 1.3.0
gabrielmbmb authored Aug 6, 2024
2 parents cf119ad + 3690bd6 commit 63f948b
Showing 162 changed files with 9,030 additions and 1,129 deletions.
1 change: 1 addition & 0 deletions .github/workflows/codspeed.yml
@@ -4,6 +4,7 @@ on:
  push:
    branches:
      - "main"
      - "develop"
  pull_request:

concurrency:
35 changes: 35 additions & 0 deletions .github/workflows/docs-pr-close.yml
@@ -0,0 +1,35 @@
name: Clean up PR documentation

on:
  pull_request:
    types: [closed]

concurrency: distilabel-docs

jobs:
  cleanup:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout merged branch
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.base.ref }}
          fetch-depth: 0

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: pip install -e .[docs]

      - name: Set git credentials
        run: |
          git config --global user.name "${{ github.actor }}"
          git config --global user.email "${{ github.actor }}@users.noreply.github.com"
      - name: Remove PR documentation
        run: |
          PR_NUMBER=${{ github.event.pull_request.number }}
          mike delete pr-$PR_NUMBER --push
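
Together with the `docs-pr.yml` workflow added in the next file, this forms a deploy/delete pair around `mike`'s versioned documentation. A rough, hedged sketch of the underlying commands (the PR number is invented for illustration, and `mike list` is assumed from mike's standard CLI rather than taken from this diff):

```bash
# Hypothetical PR number, for illustration only.
PR_NUMBER=1234

# What docs-pr.yml runs when a PR is opened or updated: publish the docs as a hidden version.
mike deploy "pr-$PR_NUMBER" --prop-set hidden=true --push

# What this workflow runs when the PR is closed: delete that version again.
mike delete "pr-$PR_NUMBER" --push

# Assumed helper: list the versions currently deployed to gh-pages.
mike list
```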
81 changes: 81 additions & 0 deletions .github/workflows/docs-pr.yml
@@ -0,0 +1,81 @@
name: Publish PR documentation

on:
  pull_request:
    types:
      - opened
      - synchronize

concurrency: distilabel-docs

jobs:
  publish:
    runs-on: ubuntu-latest
    steps:
      - name: checkout docs-site
        uses: actions/checkout@v4
        with:
          ref: gh-pages

      - uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          # Looks like it's not working very well for other people:
          # https://github.com/actions/setup-python/issues/436
          # cache: "pip"
          # cache-dependency-path: pyproject.toml

      - uses: actions/cache@v3
        id: cache
        with:
          path: ${{ env.pythonLocation }}
          key: ${{ runner.os }}-python-${{ env.pythonLocation }}-${{ hashFiles('pyproject.toml') }}-docs-pr-v00

      - name: Install dependencies
        if: steps.cache.outputs.cache-hit != 'true'
        run: pip install -e .[docs]

      - name: Set git credentials
        run: |
          git config --global user.name "${{ github.actor }}"
          git config --global user.email "${{ github.actor }}@users.noreply.github.com"
      - name: Deploy hidden docs for PR
        run: |
          PR_NUMBER=$(echo $GITHUB_REF | awk 'BEGIN { FS = "/" } ; { print $3 }')
          mike deploy pr-$PR_NUMBER --prop-set hidden=true --push
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Comment PR with docs link
        uses: actions/github-script@v7
        with:
          script: |
            const pr_number = context.payload.pull_request.number;
            const owner = context.repo.owner;
            const repo = context.repo.repo;
            // Check if a comment already exists
            const comments = await github.rest.issues.listComments({
              issue_number: pr_number,
              owner: owner,
              repo: repo
            });
            const botComment = comments.data.find(comment =>
              comment.user.type === 'Bot' &&
              comment.body.includes('Documentation for this PR has been built')
            );
            if (!botComment) {
              // Post new comment only if it doesn't exist
              await github.rest.issues.createComment({
                issue_number: pr_number,
                owner: owner,
                repo: repo,
                body: `Documentation for this PR has been built. You can view it at: https://distilabel.argilla.io/pr-${pr_number}/`
              });
            }
8 changes: 5 additions & 3 deletions .github/workflows/docs.yml
@@ -8,6 +8,8 @@ on:
    tags:
      - "**"

concurrency: distilabel-docs

jobs:
  publish:
    runs-on: ubuntu-latest
@@ -32,7 +34,7 @@ jobs:
        id: cache
        with:
          path: ${{ env.pythonLocation }}
          key: ${{ runner.os }}-python-${{ env.pythonLocation }}-${{ hashFiles('pyproject.toml') }}-docs
          key: ${{ runner.os }}-python-${{ env.pythonLocation }}-${{ hashFiles('pyproject.toml') }}-docs-v00

      - name: Install dependencies
        if: steps.cache.outputs.cache-hit != 'true'
@@ -46,9 +48,9 @@ jobs:
      - run: mike deploy dev --push
        if: github.ref == 'refs/heads/develop'
        env:
          GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - run: mike deploy ${{ github.ref_name }} latest --update-aliases --push
        if: startsWith(github.ref, 'refs/tags/')
        env:
          GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
        python-version: ["3.9", "3.10", "3.11", "3.12"]
      fail-fast: false

    steps:
16 changes: 8 additions & 8 deletions README.md
@@ -29,18 +29,18 @@
</p>


Distilabel is the **framework for synthetic data and AI feedback for AI engineers** that require **high-quality outputs, full data ownership, and overall efficiency**.
Distilabel is the framework for synthetic data and AI feedback for engineers who need fast, reliable and scalable pipelines based on verified research papers.

If you just want to get started, we recommend you check the [documentation](http://distilabel.argilla.io/). Curious, and want to know more? Keep reading!
<!-- ![overview](https://github.com/argilla-io/distilabel/assets/36760800/360110da-809d-4e24-a29b-1a1a8bc4f9b7) -->

## Why use Distilabel?
## Why use distilabel?

Whether you are working on **a predictive model** that computes semantic similarity or the next **generative model** that is going to beat the LLM benchmarks. Our framework ensures that the **hard data work pays off**. Distilabel is the missing piece that helps you **synthesize data** and provide **AI feedback**.
Distilabel can be used for generating synthetic data and AI feedback for a wide variety of projects including traditional predictive NLP (classification, extraction, etc.), or generative and large language model scenarios (instruction following, dialogue generation, judging etc.). Distilabel's programmatic approach allows you to build scalable pipelines for data generation and AI feedback. The goal of distilabel is to accelerate your AI development by quickly generating high-quality, diverse datasets based on verified research methodologies for generating and judging with AI feedback.

### Improve your AI output quality through data quality

Compute is expensive and output quality is important. We help you **focus on data quality**, which tackles the root cause of both of these problems at once. Distilabel helps you to synthesize and judge data to let you spend your valuable time on **achieveing and keeping high-quality standards for your data**.
Compute is expensive and output quality is important. We help you **focus on data quality**, which tackles the root cause of both of these problems at once. Distilabel helps you to synthesize and judge data to let you spend your valuable time **achieving and keeping high-quality standards for your data**.

### Take control of your data and models

@@ -62,7 +62,7 @@ We are an open-source community-driven project and we love to hear from you. Her

## What do people build with Distilabel?

Distilabel is a tool that can be used to **synthesize data and provide AI feedback**. Our community uses Distilabel to create amazing [datasets](https://huggingface.co/datasets?other=distilabel) and [models](https://huggingface.co/models?other=distilabel), and **we love contributions to open-source** ourselves too.
The Argilla community uses distilabel to create amazing [datasets](https://huggingface.co/datasets?other=distilabel) and [models](https://huggingface.co/models?other=distilabel).

- The [1M OpenHermesPreference](https://huggingface.co/datasets/argilla/OpenHermesPreferences) is a dataset of ~1 million AI preferences derived from teknium/OpenHermes-2.5. It shows how we can use Distilabel to **synthesize data on an immense scale**.
- Our [distilabeled Intel Orca DPO dataset](https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs) and the [improved OpenHermes model](https://huggingface.co/argilla/distilabeled-OpenHermes-2.5-Mistral-7B), show how we **improve model performance by filtering out 50%** of the original dataset through **AI feedback**.
@@ -74,7 +74,7 @@ Distilabel is a tool that can be used to **synthesize data and provide AI feedba
pip install distilabel --upgrade
```

Requires Python 3.8+
Requires Python 3.9+

In addition, the following extras are available:

Expand Down Expand Up @@ -105,14 +105,14 @@ Then run:
```python
from distilabel.llms import OpenAILLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadHubDataset
from distilabel.steps import LoadDataFromHub
from distilabel.steps.tasks import TextGeneration

with Pipeline(
    name="simple-text-generation-pipeline",
    description="A simple text generation pipeline",
) as pipeline:
    load_dataset = LoadHubDataset(output_mappings={"prompt": "instruction"})
    load_dataset = LoadDataFromHub(output_mappings={"prompt": "instruction"})

    generate_with_openai = TextGeneration(llm=OpenAILLM(model="gpt-3.5-turbo"))

1 change: 1 addition & 0 deletions docs/api/mixins/requirements.md
@@ -0,0 +1 @@
::: distilabel.mixins.requirements.RequirementsMixin
1 change: 1 addition & 0 deletions docs/api/mixins/runtime_parameters.md
@@ -0,0 +1 @@
::: distilabel.mixins.runtime_parameters.RuntimeParametersMixin
2 changes: 2 additions & 0 deletions docs/api/step/generator_step.md
@@ -5,3 +5,5 @@ This section contains the API reference for the [`GeneratorStep`][distilabel.ste
For more information and examples on how to use existing generator steps or create custom ones, please refer to [Tutorial - Step - GeneratorStep](../../sections/how_to_guides/basic/step/generator_step.md).

::: distilabel.steps.base.GeneratorStep

::: distilabel.steps.generators.utils.make_generator_step
3 changes: 3 additions & 0 deletions docs/api/step/resources.md
@@ -0,0 +1,3 @@
# StepResources

::: distilabel.steps.base.StepResources
7 changes: 4 additions & 3 deletions docs/api/step_gallery/columns.md
@@ -2,6 +2,7 @@

This section contains the existing steps intended to be used for common column operations to apply to the batches.

::: distilabel.steps.combine
::: distilabel.steps.expand
::: distilabel.steps.keep
::: distilabel.steps.columns.expand
::: distilabel.steps.columns.keep
::: distilabel.steps.columns.merge
::: distilabel.steps.columns.group
14 changes: 7 additions & 7 deletions docs/index.md
@@ -36,29 +36,29 @@ hide:
</a>
</p>

Distilabel is the **framework for synthetic data and AI feedback for AI engineers** that require **high-quality outputs, full data ownership, and overall efficiency**.
Distilabel is the framework for synthetic data and AI feedback for engineers who need fast, reliable and scalable pipelines based on verified research papers.

If you just want to get started, we recommend you check the [documentation](http://distilabel.argilla.io/). Curious, and want to know more? Keep reading!

## Why use Distilabel?
## Why use distilabel?

Whether you are working on **a predictive model** that computes semantic similarity or the next **generative model** that is going to beat the LLM benchmarks. Our framework ensures that the **hard data work pays off**. Distilabel is the missing piece that helps you **synthesize data** and provide **AI feedback**.
Distilabel can be used for generating synthetic data and AI feedback for a wide variety of projects including traditional predictive NLP (classification, extraction, etc.), or generative and large language model scenarios (instruction following, dialogue generation, judging etc.). Distilabel's programmatic approach allows you to build scalable pipelines for data generation and AI feedback. The goal of distilabel is to accelerate your AI development by quickly generating high-quality, diverse datasets based on verified research methodologies for generating and judging with AI feedback.

### Improve your AI output quality through data quality

Compute is expensive and output quality is important. We help you **focus on data quality**, which tackles the root cause of both of these problems at once. Distilabel helps you to synthesize and judge data to let you spend your valuable time on **achieveing and keeping high-quality standards for your data**.
Compute is expensive and output quality is important. We help you **focus on data quality**, which tackles the root cause of both of these problems at once. Distilabel helps you to synthesize and judge data to let you spend your valuable time **achieving and keeping high-quality standards for your synthetic data**.

### Take control of your data and models

**Ownership of data for fine-tuning your own LLMs** is not easy but Distilabel can help you to get started. We integrate **AI feedback from any LLM provider out there** using one unified API.
**Ownership of data for fine-tuning your own LLMs** is not easy but distilabel can help you to get started. We integrate **AI feedback from any LLM provider out there** using one unified API.

### Improve efficiency by quickly iterating on the right research and LLMs

Synthesize and judge data with **latest research papers** while ensuring **flexibility, scalability and fault tolerance**. So you can focus on improving your data and training your models.

## What do people build with Distilabel?
## What do people build with distilabel?

Distilabel is a tool that can be used to **synthesize data and provide AI feedback**. Our community uses Distilabel to create amazing [datasets](https://huggingface.co/datasets?other=distilabel) and [models](https://huggingface.co/models?other=distilabel), and **we love contributions to open-source** ourselves too.
The Argilla community uses distilabel to create amazing [datasets](https://huggingface.co/datasets?other=distilabel) and [models](https://huggingface.co/models?other=distilabel).

- The [1M OpenHermesPreference](https://huggingface.co/datasets/argilla/OpenHermesPreferences) is a dataset of ~1 million AI preferences derived from teknium/OpenHermes-2.5. It shows how we can use Distilabel to **synthesize data on an immense scale**.
- Our [distilabeled Intel Orca DPO dataset](https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs) and the [improved OpenHermes model](https://huggingface.co/argilla/distilabeled-OpenHermes-2.5-Mistral-7B), show how we **improve model performance by filtering out 50%** of the original dataset through **AI feedback**.
5 changes: 3 additions & 2 deletions docs/scripts/gen_popular_issues.py
@@ -24,7 +24,8 @@
REPOSITORY = "argilla-io/distilabel"
DATA_PATH = "sections/community/popular_issues.md"

GITHUB_ACCESS_TOKEN = os.getenv("GH_ACCESS_TOKEN") # public_repo and read:org scopes are required
# public_repo and read:org scopes are required
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")


def fetch_data_from_github(repository, auth_token):
@@ -79,7 +80,7 @@ def fetch_data_from_github(repository, auth_token):


with mkdocs_gen_files.open(DATA_PATH, "w") as f:
    df = fetch_data_from_github(REPOSITORY, GITHUB_ACCESS_TOKEN)
    df = fetch_data_from_github(REPOSITORY, GITHUB_TOKEN)

    open_issues = df.loc[df["State"] == "open"]
    engagement_df = (
4 changes: 2 additions & 2 deletions docs/sections/getting_started/installation.md
@@ -9,7 +9,7 @@ hide:
!!! NOTE
Since `distilabel` v1.0.0 was recently released, we refactored most of the stuff, so the installation below only applies to `distilabel` v1.0.0 and above.

You will need to have at least Python 3.8 or higher, up to Python 3.12, since support for the latter is still a work in progress.
You will need to have at least Python 3.9 or higher, up to Python 3.12, since support for the latter is still a work in progress.

To install the latest release of the package from PyPI you can use the following command:

@@ -46,7 +46,7 @@ Additionally, as part of `distilabel` some extra dependencies are available, mai

- `llama-cpp`: for using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) Python bindings for `llama.cpp` via the `LlamaCppLLM` integration.

- `mistralai`: for using models available in [Mistral AI API](https://mistral.ai/news/la-plateforme/) via the `MistralAILLM` integration. Note that the [`mistralai` Python client](https://github.com/mistralai/client-python) can only be installed from Python 3.9 onwards, so this is the only `distilabel` dependency that's not supported in Python 3.8.
- `mistralai`: for using models available in [Mistral AI API](https://mistral.ai/news/la-plateforme/) via the `MistralAILLM` integration.

- `ollama`: for using [Ollama](https://ollama.com/) and their available models via `OllamaLLM` integration.
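
As a brief, hedged illustration (not part of this diff), installing one of the extras listed here might look like the following; the `ollama` and `llama-cpp` extra names are taken from the list above:

```bash
# Install distilabel together with the Ollama integration.
pip install "distilabel[ollama]" --upgrade

# Extras can be combined in a single install.
pip install "distilabel[ollama,llama-cpp]" --upgrade
```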

28 changes: 28 additions & 0 deletions docs/sections/getting_started/quickstart.md
@@ -67,3 +67,31 @@ if __name__ == "__main__":
7. We run the pipeline with the parameters for the `load_dataset` and `text_generation` steps. The `load_dataset` step will use the repository `distilabel-internal-testing/instruction-dataset-mini` and the `test` split, and the `text_generation` task will use the `generation_kwargs` with the `temperature` set to `0.7` and the `max_new_tokens` set to `512`.

8. Optionally, we can push the generated [`Distiset`][distilabel.distiset.Distiset] to the Hugging Face Hub repository `distilabel-example`. This will allow you to share the generated dataset with others and use it in other pipelines.
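
Putting steps 7 and 8 together, the run call might look roughly like the sketch below; this is illustrative only, and the `load_dataset` and `text_generation` variable names are assumed from the snippet these annotations describe:

```python
# Illustrative sketch: assumes `pipeline`, `load_dataset` and `text_generation`
# are the objects defined in the annotated snippet above.
distiset = pipeline.run(
    parameters={
        load_dataset.name: {
            "repo_id": "distilabel-internal-testing/instruction-dataset-mini",
            "split": "test",
        },
        text_generation.name: {
            "llm": {
                "generation_kwargs": {"temperature": 0.7, "max_new_tokens": 512}
            }
        },
    },
)
distiset.push_to_hub(repo_id="distilabel-example")  # step 8, optional
```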

## Minimal example

`distilabel` gives you a lot of flexibility to create your pipelines, but to start right away you can omit most of the details and rely on the default values:

```python
from distilabel.llms import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps.tasks import TextGeneration
from datasets import load_dataset


dataset = load_dataset("distilabel-internal-testing/instruction-dataset-mini", split="test")

with Pipeline() as pipeline: # (1)
    TextGeneration(llm=InferenceEndpointsLLM(model_id="meta-llama/Meta-Llama-3.1-8B-Instruct")) # (2)


if __name__ == "__main__":
    distiset = pipeline.run(dataset=dataset) # (3)
    distiset.push_to_hub(repo_id="distilabel-example")
```

1. The [`Pipeline`][distilabel.pipeline.Pipeline] can take no arguments and generate a default name on its own that will be tracked internally.

2. Just as with the [`Pipeline`][distilabel.pipeline.Pipeline], the [`Step`][distilabel.steps.base.Step]s don't explicitly need a name.

3. You can generate the dataset as you would normally do with Hugging Face and pass the dataset to the run method.